:; #-*- mode: nendo; syntax: scheme -*-;; ;;; ;;; jisyo-db.nnd - 辞書DBの構築、辞書DBアクセスのライブラリ ;;; ;;; Copyright (c) 2010 Kiyoka Nishiyama ;;; ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions ;;; are met: ;;; ;;; 1. Redistributions of source code must retain the above copyright ;;; notice, this list of conditions and the following disclaimer. ;;; ;;; 2. Redistributions in binary form must reproduce the above copyright ;;; notice, this list of conditions and the following disclaimer in the ;;; documentation and/or other materials provided with the distribution. ;;; ;;; 3. Neither the name of the authors nor the names of its contributors ;;; may be used to endorse or promote products derived from this ;;; software without specific prior written permission. ;;; ;;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ;;; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ;;; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ;;; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ;;; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ;;; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED ;;; TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ;;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ;;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;; ;;; $Id: ;;; (use srfi-1) (require "progressbar") (require "sekka/kvs") (require "sekka/sekkaversion") (use sekka.util) (define masterid "MASTER") (define workid "WORK") (define versionid "SEKKA::VERSION") (define alphabet-string "abcdefghijklmnopqrstuvwxyz>@;#") (define alphabet-lower-list (to-list (alphabet-string.split ""))) (define alphabet-upper-list (to-list (. (sekka-upcase alphabet-string) split ""))) (define alphabet-pairs (append-map (lambda (a) (map (lambda (b) (+ a b)) (append alphabet-upper-list alphabet-lower-list))) alphabet-lower-list)) ;; KVS type setting (define *kvs-type* 'tokyocabinet) ;; default (define (set-kvs-type type) (if (not (symbol? type)) (error "Error: set-kvs-type requires symbol argument.") (set! *kvs-type* type))) (define (get-kvs-type) *kvs-type*) ;; dict-get search sequence ;; (1) try "userid::keyword" key ;; (2) try "MASTER::keyword" key (define (dict-get userid kvs key . fallback) (if-let1 value (or (kvs.get (+ userid "::" key) #f) (kvs.get (+ masterid "::" key) #f)) value (let1 opt (get-optional fallback #f) opt))) (define (create-2char-hash keylist) (define okuri-ari-hash (make-hash-table)) (define okuri-nashi-hash (make-hash-table)) (define hiragana-phrase-hash (make-hash-table)) (define (create-hash keylist) (for-each (lambda (k) (when (rxmatch #/^[=a-zA-Z#^>-@`\;+:'\-]+$/ k) (if (rxmatch #/^=/ k) ;; HIRAGANA-PHRASE (let1 sliced (sekka-downcase (k.slice 1 2)) (when (= 2 sliced.size) (hash-table-push! hiragana-phrase-hash sliced k))) ;; OKURI-ARI and OKURI-NASHI (let1 sliced (sekka-downcase (k.slice 0 2)) (when (= 2 sliced.size) (if (rxmatch #/[A-Z`+]$/ k) (hash-table-push! okuri-ari-hash sliced k) (hash-table-push! okuri-nashi-hash sliced k))))))) keylist)) (create-hash keylist) (values okuri-ari-hash okuri-nashi-hash hiragana-phrase-hash)) (define (setup-ready-made-keylist kvs keylist) (for-each (lambda (key) (let1 key (+ masterid "::" key) (unless (kvs.get key #f) (kvs.put! key "")))) alphabet-pairs) (receive (okuri-ari-hash okuri-nashi-hash hiragana-phrase-hash) (create-2char-hash keylist) ;; OKURI-ARI (for-each (lambda (key) (kvs.put! (+ masterid "::" "(" (sekka-upcase key) ")") (string-join (uniq (sort (hash-table-get okuri-ari-hash key))) " "))) (hash-table-keys okuri-ari-hash)) ;; OKURI-NASHI (for-each (lambda (key) (kvs.put! (+ masterid "::" "(" (sekka-downcase key) ")") (string-join (uniq (sort (hash-table-get okuri-nashi-hash key))) " "))) (hash-table-keys okuri-nashi-hash)) ;; HIRAGANA-PHRASE (for-each (lambda (key) (kvs.put! (+ masterid "::" "{" (sekka-downcase key) "}") (string-join (uniq (sort (hash-table-get hiragana-phrase-hash key))) " "))) (hash-table-keys hiragana-phrase-hash)))) (define (append-entry userid kvs _key value) (let1 key (+ userid "::" _key) (cond ((rxmatch #/^=/ _key) ;; hiragana-phrase entry (kvs.put! key value)) (else (if-let1 got (kvs.get key #f) (let* ((trimmed-a-first1 (car (string-split-first-and-rest got))) (trimmed-a (if-let1 m (rxmatch #/^[C\/](.+)$/ got) (rxmatch-substring m 1) got)) (trimmed-b-first1 (car (string-split-first-and-rest value))) (trimmed-b (if-let1 m (rxmatch #/^[C\/](.+)$/ value) (rxmatch-substring m 1) value)) (lst (delete-duplicates (append (to-list (trimmed-a.split "/")) (to-list (trimmed-b.split "/")))))) (if (= trimmed-a-first1 trimmed-b-first1) (kvs.put! key (+ trimmed-a-first1 (string-join lst "/"))) (begin ;; "/" よりも "C" を優先する (cond ((= "C" trimmed-a-first1) ;; aを優先 (kvs.put! key got)) ((= "C" trimmed-b-first1) ;; bを優先 (kvs.put! key value)) (else (errorf "Error: [%s][%s] entry is wrong format" key value)))))) (kvs.put! key value)))))) (define (load-sekka-jisyo-f f filename) (define keylist '()) (define (create-keylist kvs f) (for-each (lambda (line) (let1 fields (split-dict-line line) (set! keylist (cons (first fields) keylist)) (append-entry masterid kvs (first fields) (second fields)))) (f.readlines))) (let1 kvs (Kvs.new (get-kvs-type)) (kvs.open filename) (kvs.clear) (create-keylist kvs f) (setup-ready-made-keylist kvs keylist) (kvs.put! versionid SekkaVersion.version) (kvs.close))) (define (dump-sekka-jisyo-f f filename) (let1 kvs (Kvs.new (get-kvs-type)) (kvs.open filename) (for-each (lambda (key) (f.puts (+ key "\t" (kvs.get key)))) (kvs.keys.sort)) (kvs.close))) (define (restore-sekka-jisyo-f f filename-or-hostname) (define (file-length f) (let1 total 0 (for-each (lambda (x) (set! total (+ 1 total))) f) (f.rewind) total)) (define (restore f kvs progress) (let loop ([line (f.readline.chomp)]) (let* ( [key-value (to-list (line.split #/[\t]+/))] [key (first key-value)] [value (if (> 2 (length key-value)) "" (second key-value))]) (kvs.put! key value) (progress.inc)) (when (not (f.eof)) (loop (f.readline.chomp))))) (let1 total (file-length f) (let ([kvs (Kvs.new (get-kvs-type))] [progress (ProgressBar.new "Restore" total STDERR)]) (kvs.open filename-or-hostname) (kvs.clear) (restore f kvs progress) (kvs.close)))) (define (openSekkaJisyo dictType dictSource cacheSource) (set-kvs-type dictType) (let1 kvs (Kvs.new (get-kvs-type)) (kvs.open #?=dictSource) (let1 cachesv (if cacheSource (let1 obj (Kvs.new 'memcache) (obj.open cacheSource) obj) #f) ;; 読み書きできるか調べる (let ([d (. (Date.new 0) to_s)]) (if (and (kvs.pure_put! "key_for_ping" d) (string=? (kvs.get "key_for_ping") d)) (STDERR.puts "Info: database file is clean") ;; 正常に読み書きできないようであれば、データベースを修復する (begin (STDERR.puts "Info: database file is NOT clean. try to fix...") (kvs.fixdb) (STDERR.puts "Info: done.")))) (to-arr (list kvs cachesv))))) ;; Export to Ruby world (export-to-ruby openSekkaJisyo)