:; #-*- mode: nendo; syntax: scheme -*-;; ;;; ;;; jisyo-db.nnd - 辞書DBの構築、辞書DBアクセスのライブラリ ;;; ;;; Copyright (c) 2010 Kiyoka Nishiyama ;;; ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions ;;; are met: ;;; ;;; 1. Redistributions of source code must retain the above copyright ;;; notice, this list of conditions and the following disclaimer. ;;; ;;; 2. Redistributions in binary form must reproduce the above copyright ;;; notice, this list of conditions and the following disclaimer in the ;;; documentation and/or other materials provided with the distribution. ;;; ;;; 3. Neither the name of the authors nor the names of its contributors ;;; may be used to endorse or promote products derived from this ;;; software without specific prior written permission. ;;; ;;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ;;; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ;;; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ;;; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ;;; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ;;; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED ;;; TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ;;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ;;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;; ;;; $Id: ;;; (use srfi-1) (use util.list) (require "ruby-progressbar") (require "sekka/kvs") (require "sekka/sekkaversion") (require "distributedtrie") (use sekka.util) (define master-prefix "M") (define version-prefix "SEKKA:VERSION") (define okuri-ari-prefix "IK") (define okuri-nashi-prefix "Ik") (define okuri-nashi-reverse-prefix "Ir") (define hiragana-phrase-prefix "Ih") (define progressbar-format "%E |%B| %p%% %t") (define okuri-nashi-reverse-min-length 6) ;; KVS type setting (define *kvs-type* 'tokyocabinet) ;; default (define (set-kvs-type type) (if (not (symbol? type)) (error "Error: set-kvs-type requires symbol argument.") (set! *kvs-type* type))) (define (get-kvs-type) *kvs-type*) ;; dict-get search sequence ;; (1) try "userid:keyword" key ;; (2) try "M:keyword" key (define (dict-get userid kvs key . fallback) (if-let1 value (or (kvs.get (+ userid ":" key) #f) (kvs.get (+ master-prefix ":" key) #f)) value (let1 opt (get-optional fallback #f) opt))) (define (setup-trie-index userid kvs keylist displayProgress) (define (trie-add-autocommit trie keylist progress) (let ([i 0]) (for-each (lambda (k) (trie.addKey! k) (set! i (+ i 1)) (when (= 0 (% i 100)) (trie.commit!)) (when progress (progress.increment))) keylist)) trie.commit!) (define (grouping-keylist keylist progress) (let ([okuri-ari-list '()] [okuri-nashi-list '()] [hiragana-phrase-list '()]) (for-each (lambda (k) (if (rxmatch #/^[=a-zA-Z#^>-@`\;+:'\-]+$/ k) (if (rxmatch #/^=/ k) ;; HIRAGANA-PHRASE (push! hiragana-phrase-list k) ;; OKURI-ARI and OKURI-NASHI (if (rxmatch #/[A-Z`+]$/ k) (push! okuri-ari-list k) (push! okuri-nashi-list k)))) (when progress (progress.increment))) keylist) (values okuri-ari-list okuri-nashi-list hiragana-phrase-list))) (let1 progress1 (if displayProgress (ProgressBar.create (alist->hash-table `( [title . "trie-tree(1)"] [total . ,(length keylist)] [output . ,STDERR] [format . ,progressbar-format] ))) #f) (receive (okuri-ari-list okuri-nashi-list hiragana-phrase-list) (grouping-keylist keylist progress1) (let1 okuri-nashi-reverse-list (filter-map (lambda (str) (if (or (< str.length okuri-nashi-reverse-min-length) (rxmatch #/[#]/ str)) #f (str.reverse))) okuri-nashi-list) (let1 progress2 (if displayProgress (ProgressBar.create (alist->hash-table `( [title . "trie-tree(2)"] [total . ,(+ (length okuri-ari-list) (length okuri-nashi-list) (length okuri-nashi-reverse-list) (length hiragana-phrase-list))] [output . ,STDERR] [format . ,progressbar-format] ))) #f) (set! keylist '()) ;; free memory (let1 trie (DistributedTrie::Trie.new kvs (+ okuri-ari-prefix ":" userid ":")) (trie-add-autocommit trie okuri-ari-list progress2)) (set! okuri-ari-list '()) ;; free memory (let1 trie (DistributedTrie::Trie.new kvs (+ okuri-nashi-prefix ":" userid ":")) (trie-add-autocommit trie okuri-nashi-list progress2)) (set! okuri-nashi-list '()) ;; free memory (let1 trie (DistributedTrie::Trie.new kvs (+ okuri-nashi-reverse-prefix ":" userid ":")) (trie-add-autocommit trie okuri-nashi-reverse-list progress2)) (set! okuri-nashi-reverse-list '()) ;; free memory (let1 trie (DistributedTrie::Trie.new kvs (+ hiragana-phrase-prefix ":" userid ":")) (trie-add-autocommit trie hiragana-phrase-list progress2)) (set! hiragana-phrase-list '()) ;; free memory ))))) (define (append-entry userid kvs _key value) (let1 key (+ userid ":" _key) (cond ((rxmatch #/^=/ _key) ;; hiragana-phrase entry (kvs.put! key value)) (else (if-let1 got (kvs.get key #f) (let* ((trimmed-a-first1 (car (string-split-first-and-rest got))) (trimmed-a (if-let1 m (rxmatch #/^[C\/](.+)$/ got) (rxmatch-substring m 1) got)) (trimmed-b-first1 (car (string-split-first-and-rest value))) (trimmed-b (if-let1 m (rxmatch #/^[C\/](.+)$/ value) (rxmatch-substring m 1) value)) (lst (delete-duplicates (append (to-list (trimmed-a.split "/")) (to-list (trimmed-b.split "/")))))) (if (= trimmed-a-first1 trimmed-b-first1) (kvs.put! key (+ trimmed-a-first1 (string-join lst "/"))) (begin ;; "/" よりも "C" を優先する (cond ((= "C" trimmed-a-first1) ;; aを優先 (kvs.put! key got)) ((= "C" trimmed-b-first1) ;; bを優先 (kvs.put! key value)) (else (errorf "Error: [%s][%s] entry is wrong format" key value)))))) (kvs.put! key value)))))) (define (load-sekka-jisyo-f f filename) (define keylist '()) (define (create-keylist kvs f) (let1 progress (ProgressBar.create (alist->hash-table `( [title . "data-body "] [total . ,(file-length f)] [output . ,STDERR] [format . ,progressbar-format] ))) (for-each (lambda (line) (let* ( [line (line.chomp)] [fields (split-dict-line line)]) (push! keylist (first fields)) (append-entry master-prefix kvs (first fields) (second fields))) (progress.increment)) f))) (let1 kvs (Kvs.new (get-kvs-type)) (kvs.open filename) (kvs.clear) (create-keylist kvs f) (setup-trie-index master-prefix kvs keylist #t) (kvs.put! version-prefix SekkaVersion.dictVersion) (kvs.close))) (define (dump-sekka-jisyo-f f filename) (let1 kvs (Kvs.new (get-kvs-type)) (kvs.open filename) (for-each (lambda (key) (f.puts (+ key "\t" (kvs.get key)))) (kvs.keys.sort)) (kvs.close))) (define (restore-sekka-jisyo-f f filename-or-hostname) (define (restore f kvs progress) (let loop ([line (f.readline.chomp)]) (let* ( [key-value (to-list (line.split #/[\t]+/))] [key (first key-value)] [value (if (> 2 (length key-value)) "" (second key-value))]) (kvs.put! key value) (progress.increment)) (when (not (f.eof)) (loop (f.readline.chomp))))) (let1 total (file-length f) (let ([kvs (Kvs.new (get-kvs-type))] [progress (ProgressBar.create (alist->hash-table `( [title . "restore "] [total . ,total] [output . ,STDERR] [format . ,progressbar-format] )))]) (f.set_encoding "utf-8" "utf-8") (kvs.open filename-or-hostname) (kvs.clear) (restore f kvs progress) (kvs.close)))) (define (openSekkaJisyo dictType dictSource cacheSource) (set-kvs-type dictType) (let1 kvs (Kvs.new (get-kvs-type)) (kvs.open #?=dictSource) (let1 cachesv (if cacheSource (let1 obj (Kvs.new 'memcache) (obj.open cacheSource) obj) #f) ;; 読み書きできるか調べる (let ([d (. (Date.new 0) to_s)]) (if (and (kvs.pure_put! "key_for_ping" d) (string=? (kvs.get "key_for_ping") d)) #?="Info: database file is clean" ;; 正常に読み書きできないようであれば、データベースを修復する (begin #?="Info: database file is NOT clean. try to fix..." (kvs.fixdb) #?="Info: done."))) (to-arr (list kvs cachesv))))) ;; Export to Ruby world (export-to-ruby openSekkaJisyo)