:; #-*- mode: nendo; syntax: scheme -*-;; ;;; ;;; henkan.nnd - 変換エンジンのコア ;;; ;;; Copyright (c) 2010 Kiyoka Nishiyama ;;; ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions ;;; are met: ;;; ;;; 1. Redistributions of source code must retain the above copyright ;;; notice, this list of conditions and the following disclaimer. ;;; ;;; 2. Redistributions in binary form must reproduce the above copyright ;;; notice, this list of conditions and the following disclaimer in the ;;; documentation and/or other materials provided with the distribution. ;;; ;;; 3. Neither the name of the authors nor the names of its contributors ;;; may be used to endorse or promote products derived from this ;;; software without specific prior written permission. ;;; ;;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ;;; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ;;; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ;;; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ;;; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ;;; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED ;;; TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ;;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ;;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;; ;;; $Id: ;;; (use srfi-1) (use rfc.json) (use sekka.util) (use sekka.convert-jisyo) (use sekka.jisyo-db) (use sekka.alphabet-lib) (use sekka.sharp-number) (use sekka.google-ime) (require "sekka/approximatesearch") ;; 曖昧検索の評価値にヒューリスティックを加える ;; "nn" のように n が重なるほど、"ん" の可能性が強くなる。 ;; "にゃ" "にゅ" "にょ" が出現した場合は、n がそちらの解釈に取られたものと解釈してその件数分引いておく。 (define (nn-heuristics keyword lst) (map (lambda (x) (let ((point (first x)) (roman (second x)) (target (third x))) (let* ((nn-number (min (vector-length (keyword.scan #/nn/)) (vector-length (roman.scan #/nn/)) (- (vector-length (target.scan #/ん/)) (vector-length (target.scan #/[ゃゅょ]/))))) (nn-number (max 0 nn-number))) (list (if (= 0 nn-number) point (* point (* 1.1 nn-number))) roman target)))) lst)) (define (flatten-vector vec) (vec.flatten)) ;; 曖昧検索 ;; リストで返す。 (ソート済み検索結果) (define (approximate-search userid kvs keyword type limit) (let ([jarow-threshold (if (= "h" type) 0.975 0.94)]) (let* ((a-search (ApproximateSearch.new jarow-threshold)) (lst (to-list (flatten-vector (map (lambda (x) (let* ((val (vector-ref x 0)) (k (vector-ref x 1)) (v (dict-get userid kvs k "")) (v-pair (string-split-first-and-rest v)) (v-first (car v-pair)) (v-rest (cdr v-pair))) (map (lambda (element) (list val k (+ v-first element))) (v-rest.split "/")))) (a-search.search userid kvs keyword type))))) (nn-heuristics-result (nn-heuristics keyword lst)) (sorted-lst (sort-by nn-heuristics-result (lambda (item) (- 1.0 (car item)))))) (if (= limit 0) sorted-lst (take* sorted-lst limit))))) ;; 厳密検索 (define (exact-search userid kvs keyword) (let1 v (dict-get userid kvs keyword #f) (if v `((1.0 ,keyword ,v)) '()))) ;; split "/a;annotation/b/c/" into `(("a" "annotation" ,src) ("b" #f ,src) ("c" #f ,src)) alist (define (split-henkan-kouho str src . okuri) (define (append-j-type lst) (map (lambda (x) (append x (list 'j))) lst)) (let* ((trimmed (cond ((rxmatch #/^[\/](.+)$/ str) => (lambda (m) (rxmatch-substring m 1))) (else ""))) (splitted (filter (lambda (str) (< 0 str.size)) (to-list (trimmed.split "/")))) (okuri (get-optional okuri ""))) ;; pickup annotation (append-j-type (map (lambda (entry) (let1 vec (entry.split ";") (list (+ (vector-ref vec 0) okuri) (vector-ref vec 1 #f) src))) splitted)))) ;; 送り仮名なしの変換 (define (henkan-okuri-nashi userid kvs keyword limit) (let* ((result (approximate-search userid kvs keyword "k" limit)) (kouho (uniq (map (lambda (x) (third x)) result)))) (let1 lst (append-map (lambda (value) (if-let1 m (rxmatch #/^C(.+)$/ value) (split-henkan-kouho (dict-get userid kvs (string-drop value 1)) (rxmatch-substring m 1) ) ;; continue to fetch (split-henkan-kouho value keyword))) kouho) (if (= limit 0) lst (take* lst limit))))) ;; 送り仮名なし、かつ、数字で始まるキーワードの変換 (define (henkan-okuri-nashi-and-number userid kvs keyword limit) ;; 数字部分を抜きだして '#' に変換 (let ([num-list (to-list (keyword.scan #/[0-9]+/))] [replaced (keyword.gsub #/[0-9]+/ "#")]) ;; 辞書引きした結果の # 部分を 再度数値に戻す。 (let1 result (henkan-okuri-nashi userid kvs replaced limit) (map (lambda (entry) (let* ([type-list (to-list (. (car entry) scan #/[#][0-9]/))] [converted-list (map (lambda (_type _num) (sekka-henkan-sharp-number _type _num)) type-list num-list)] [format-str (. (car entry) gsub #/[#][0-9]/ "%s")]) (cons (apply sprintf (cons format-str converted-list)) (append (drop-right (cdr entry) 1) (list 'n))))) result)))) ;; 送り仮名ありの変換 (define (henkan-okuri-ari userid kvs keyword limit roman-method) (let* ((keyword (let1 _pair (string-split-first-and-rest keyword) (+ (sekka-downcase (car _pair)) (cdr _pair)))) (m (rxmatch #/^([a-z])([a-z\-^]*)([A-Z`+])([a-zA-Z]*)$/ keyword))) (if (not m) '() ;; 変換候補無し (let* ((result (approximate-search userid kvs (+ (rxmatch-substring m 1) (rxmatch-substring m 2) (rxmatch-substring m 3)) "K" limit)) (okurigana-lst (gen-roman->hiragana (sekka-downcase (+ (rxmatch-substring m 3) (rxmatch-substring m 4))) roman-method)) (okurigana-lst (if (null? okurigana-lst) '("") okurigana-lst)) (kouho (uniq (map (lambda (x) (third x)) result)))) (let1 lst (append-map (lambda (value) (append-map (lambda (x) (if-let1 m (rxmatch #/^C(.+)$/ value) (split-henkan-kouho (dict-get userid kvs (string-drop value 1)) (rxmatch-substring m 1) x) ;; continue to fetch (split-henkan-kouho value keyword x))) okurigana-lst)) kouho) (if (= limit 0) lst (take* lst limit))))))) ;; 平仮名フレーズ辞書を使った曖昧検索 (define (henkan-hiragana-phrase userid kvs keyword limit roman-method) (let* ([result (approximate-search userid kvs (+ "=" (sekka-downcase keyword)) "h" limit)] [uniq-result (delete-duplicates (map (lambda (x) (third x)) result))] [lst (map (lambda (x) (list x #f keyword 'h)) uniq-result)]) (if (= limit 0) lst (take* lst limit)))) ;; 平仮名の変換 (define (henkan-hiragana userid kvs keyword roman-method) (let* ([phrase-limit 3] [str (sekka-downcase keyword)] [hira (gen-roman->hiragana str roman-method)] [kata (gen-roman->katakana str roman-method)]) (append (henkan-hiragana-phrase userid kvs keyword phrase-limit roman-method) (if (null? hira) `((,keyword #f ,keyword j)) (append-map (lambda (h k) `( (,h #f ,keyword h) (,k #f ,keyword k))) hira kata))))) ;; アルファベットの単純変換 (define (henkan-alphabet kvs keyword) (let* ((zen (gen-alphabet-han->zen keyword)) (han (gen-alphabet-zen->han keyword))) `( (,zen #f ,keyword z) (,han #f ,keyword l)))) ;; 記号を含むキーワードの変換 (define (henkan-non-kanji userid kvs keyword) (let* ((result (exact-search userid kvs keyword)) (kouho (map (lambda (x) (third x)) result))) (append-map (lambda (value) (split-henkan-kouho value keyword)) kouho))) ;; 数字だけで構成されるキーワードの変換 (define (henkan-number kvs keyword) ;; 数字部分を抜きだして '#' に変換 `( (,(sekka-henkan-sharp-number "#1" keyword) nil ,keyword n) (,(sekka-henkan-sharp-number "#0" keyword) nil ,keyword n) (,(sekka-henkan-sharp-number "#2" keyword) nil ,keyword n) (,(sekka-henkan-sharp-number "#3" keyword) nil ,keyword n))) ;; keyword には ローマ字のみを受け付ける (define (sekka-henkan userid kvs keyword limit roman-method) ;; 変換候補にindex番号を付加する (define (append-index-number kouho-list) (let1 count 0 (map (lambda (x) (begin0 (append x (list count)) (set! count (+ count 1)))) kouho-list))) ;; キーワードの種別で処理を分割する (append-index-number (cond ((rxmatch #/[A-Z`+]/ keyword) (let1 k (string-downcase-first keyword) (cond ((rxmatch #/[a-z][A-Z`+]/ k) ;; 送りあり (append (append (henkan-okuri-ari userid kvs k limit roman-method) (if (null? (gen-roman->hiragana (sekka-downcase k) roman-method)) '() (henkan-hiragana userid kvs (sekka-downcase k) roman-method))) (henkan-alphabet kvs keyword))) (else ;; 送りなし (append (append (henkan-okuri-nashi userid kvs k limit) (if (null? (gen-roman->hiragana (sekka-downcase k) roman-method)) '() (henkan-hiragana userid kvs (sekka-downcase k) roman-method))) (henkan-alphabet kvs keyword)))))) ;; 10進数数値のみで構成されるキーワード ((rxmatch #/^[0-9]+$/ keyword) (henkan-number kvs keyword)) ;; 10進数数値で始まるキーワード ((rxmatch #/^[0-9]+[0-9a-zA-Z@-]+$/ keyword) (append (henkan-okuri-nashi-and-number userid kvs keyword limit) (henkan-alphabet kvs keyword))) ((not (null? (gen-roman->hiragana keyword roman-method))) (append (append (henkan-hiragana userid kvs keyword roman-method) (henkan-alphabet kvs keyword)) (henkan-okuri-nashi userid kvs keyword limit))) (else (append (henkan-non-kanji userid kvs keyword) (henkan-alphabet kvs keyword)))))) ;; conversion #f => nil for EmacsLisp (define (sekkaHenkan userid kvs cachesv keyword limit roman-method) (define cache-exp-second (* 60 60)) (let* ((keyword (keyword.strip)) (sekka-keyword (+ "sekka::" roman-method "::" (limit.to_s) "::" keyword))) (if-let1 fetched (and cachesv (cachesv.get sekka-keyword #f)) (begin keyword (read-from-string fetched)) (let1 henkan-result (map (lambda (x) (map (lambda (val) (if val val nil)) x)) (sekka-henkan userid kvs keyword limit (make-keyword roman-method))) (and cachesv (not (null? henkan-result)) (let1 fetched2 (cachesv.get "sekka::(keys)" #f) (cachesv.put! sekka-keyword (write-to-string henkan-result) cache-exp-second) (cachesv.put! "sekka::(keys)" (if fetched2 (+ fetched2 " " sekka-keyword) sekka-keyword) cache-exp-second) (if-let1 v (cachesv.get "sekka::(keys)" #f) (begin #?=(v.size) #?=v) #f))) henkan-result)))) ;; Export to Ruby world (export-to-ruby sekkaHenkan) ;; Flush henkan-result cache data on cachesv(memcached) (define (flushCacheServer cachesv) #?="--- do (flushCacheServer) ---" (if-let1 fetched (cachesv.get "sekka::(keys)" #f) (begin (for-each (lambda (x) (cachesv.delete #?=x)) (to-list (fetched.split #/[ ]+/))) (cachesv.delete "sekka::(keys)")) #f)) ;; Export to Ruby world (export-to-ruby flushCacheServer) ;; 確定処理: 最終確定語を変換候補の先頭に持ってくる。 ;; key ... "developer" や "へんかん" など、変換候補レコードのキーとなる文字列 ;; tango ... "変換" など、変換候補から最終確定した、変換候補の文字列 ;; 登録失敗したら nil を返す (define (sekkaKakutei userid kvs cachesv key tango) (define (fetch userid kvs key) (dict-get userid kvs key #f)) (define (write-user-entry userid kvs key value) (kvs.put! (+ userid "::" key) value)) (define (join-henkan-kouho lst) (+ "/" (string-join (map (lambda (x) (if (second x) (+ (first x) ";" (second x)) (first x))) lst) "/"))) (let1 tango (if (is-hiragana-and-okuri key) (drop-okuri tango) tango) (if-let1 kouho-str (fetch userid kvs key) (if (rxmatch #/^\// kouho-str) (let* ((kouho-lst (split-henkan-kouho kouho-str key)) (no1 (filter (lambda (x) (eq? (car x) tango)) kouho-lst)) (other (filter (lambda (x) (not (eq? (car x) tango))) kouho-lst)) (new-kouho-str (join-henkan-kouho (append no1 other)))) (if (not (= kouho-str new-kouho-str)) (begin (write-user-entry userid kvs key new-kouho-str) (and cachesv (flushCacheServer cachesv)) tango) nil)) nil) nil))) ;; Export to Ruby world (export-to-ruby sekkaKakutei) ;; ユーザー辞書の全削除 (useridで指定したユーザーの辞書) ;; 削除した件数を返す。 (define (flushUserJisyo userid kvs) (let1 lst (to-list (kvs.forward_match_keys (+ userid "::"))) (for-each (lambda (k) (kvs.delete k)) lst) (length lst))) ;; Export to Ruby world (export-to-ruby flushUserJisyo) ;; ユーザー定義語彙の登録処理 (define (registerUserJisyo userid kvs dict-line) (define user-keylist '()) (define (insert-to-db sekka-jisyo-data) (for-each (lambda (entry) (let* ((kv (to-list (entry.split #/[ ]+/))) (k (first kv)) (v (second kv))) (set! user-keylist (cons k user-keylist)) (append-entry userid kvs k v))) sekka-jisyo-data)) (define (_create-ready-made-keylist keylist) (receive (okuri-ari-hash okuri-nashi-hash hiragana-phrase-hash) (create-2char-hash keylist) ;; OKURI-ARI (for-each (lambda (key) (let1 fetched (kvs.get (+ userid "::" "(" (sekka-upcase key) ")") (kvs.get (+ masterid "::" "(" (sekka-upcase key) ")") "")) (kvs.put! (+ userid "::" "(" (sekka-upcase key) ")") (string-join (uniq (sort (append (to-list (fetched.split #/[ ]+/)) (hash-table-get okuri-ari-hash key)))) " ")))) (hash-table-keys okuri-ari-hash)) ;; OKURI-NASHI (for-each (lambda (key) (let1 fetched (kvs.get (+ userid "::" "(" (sekka-downcase key) ")") (kvs.get (+ masterid "::" "(" (sekka-downcase key) ")") "")) (kvs.put! (+ userid "::" "(" (sekka-downcase key) ")") (string-join (uniq (sort (append (to-list (fetched.split #/[ ]+/)) (hash-table-get okuri-nashi-hash key)))) " ")))) (hash-table-keys okuri-nashi-hash)) ;; HIRAGANA-PHRASE (for-each (lambda (key) (let1 fetched (kvs.get (+ userid "::" "{" (sekka-downcase key) "}") (kvs.get (+ masterid "::" "{" (sekka-downcase key) "}") "")) (kvs.put! (+ userid "::" "{" (sekka-downcase key) "}") (string-join (uniq (sort (append (to-list (fetched.split #/[ ]+/)) (hash-table-get hiragana-phrase-hash key)))) " ")))) (hash-table-keys hiragana-phrase-hash)))) ;; "ユーザー語彙を"(stored)"にpush!する" (define (kvs-push! userid kvs entry-str) (let* ((orig (kvs.get (+ userid "::(stored)") "()")) (orig (read-from-string orig))) (kvs.put! (+ userid "::(stored)") (write-to-string (append orig (list entry-str)))) entry-str)) (cond ((memv dict-line (read-from-string (kvs.get (+ userid "::(stored)") "()"))) #?=(+ "already stored userid=[" userid "] tango=[" dict-line "]") #f) (else (cond ((rxmatch #/[ ]+[\/]/ dict-line) (let1 lst (convert-skk-jisyo-f (StringIO.new (+ dict-line "\n"))) (if (null? lst) (begin #?=(+ "Error user dict format error userid=[" userid "] tango=[" dict-line "]") #f) (begin (insert-to-db lst) (_create-ready-made-keylist user-keylist) (kvs-push! userid kvs dict-line) #?=(+ "user dict stored userid=[" userid "] tango=[" dict-line "]") #t)))) (else #?=(+ "Error user dict format error userid=[" userid "] tango=[" dict-line "]") #f))))) ;; Export to Ruby world (export-to-ruby registerUserJisyo) ;; ユーザー定義語彙の登録処理 (define (googleIme keyword proxy-host proxy-port) (request-google-ime keyword #f proxy-host proxy-port)) ;; Export to Ruby world (export-to-ruby googleIme)