docs/classes/Ankusa/Classifier.html in ankusa-0.0.5 vs docs/classes/Ankusa/Classifier.html in ankusa-0.0.6

- old
+ new

@@ -86,14 +86,17 @@ <h3 class="section-bar">Methods</h3> <div class="name-list"> <a href="#M000007">classifications</a>&nbsp;&nbsp; <a href="#M000006">classify</a>&nbsp;&nbsp; - <a href="#M000008">get_word_probs</a>&nbsp;&nbsp; + <a href="#M000010">doc_count_totals</a>&nbsp;&nbsp; + <a href="#M000009">get_word_probs</a>&nbsp;&nbsp; + <a href="#M000008">log_likelihoods</a>&nbsp;&nbsp; <a href="#M000003">new</a>&nbsp;&nbsp; <a href="#M000004">train</a>&nbsp;&nbsp; <a href="#M000005">untrain</a>&nbsp;&nbsp; + <a href="#M000011">vocab_sizes</a>&nbsp;&nbsp; </div> </div> </div> @@ -156,37 +159,32 @@ <div id="method-M000007" class="method-detail"> <a name="M000007"></a> <div class="method-heading"> <a href="#M000007" class="method-signature"> - <span class="method-name">classifications</span><span class="method-args">(text)</span> + <span class="method-name">classifications</span><span class="method-args">(text, classnames=nil)</span> </a> </div> <div class="method-description"> + <p> +Classes is an array of classes to look at +</p> <p><a class="source-toggle" href="#" onclick="toggleCode('M000007-source');return false;">[Source]</a></p> <div class="method-source-code" id="M000007-source"> <pre> -<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 46</span> - <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>) - <span class="ruby-identifier">result</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span> - - <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span> - <span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>) - <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> (<span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) <span class="ruby-operator">*</span> <span class="ruby-identifier">count</span>) } +<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 53</span> + <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword kw">nil</span>) + <span class="ruby-identifier">result</span> = <span class="ruby-identifier">log_likelihoods</span> <span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span> + <span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> + <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] } - <span class="ruby-comment cmt"># add the prior and exponentiate</span> - <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> - <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">k</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">doc_count_total</span>.<span class="ruby-identifier">to_f</span>) - <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span>(<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>]) - } - <span class="ruby-comment cmt"># normalize to get probs</span> <span class="ruby-identifier">sum</span> = <span class="ruby-identifier">result</span>.<span class="ruby-identifier">values</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> } - <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span> } + <span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span> } <span class="ruby-identifier">result</span> <span class="ruby-keyword kw">end</span> </pre> </div> </div> @@ -195,29 +193,70 @@ <div id="method-M000006" class="method-detail"> <a name="M000006"></a> <div class="method-heading"> <a href="#M000006" class="method-signature"> - <span class="method-name">classify</span><span class="method-args">(text)</span> + <span class="method-name">classify</span><span class="method-args">(text, classes=nil)</span> </a> </div> <div class="method-description"> <p><a class="source-toggle" href="#" onclick="toggleCode('M000006-source');return false;">[Source]</a></p> <div class="method-source-code" id="M000006-source"> <pre> -<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 41</span> - <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>) +<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 47</span> + <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>=<span class="ruby-keyword kw">nil</span>) <span class="ruby-comment cmt"># return the most probable class</span> - <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">sort_by</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">c</span><span class="ruby-operator">|</span> <span class="ruby-operator">-</span><span class="ruby-identifier">c</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span> + <span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>).<span class="ruby-identifier">sort_by</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">c</span><span class="ruby-operator">|</span> <span class="ruby-operator">-</span><span class="ruby-identifier">c</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span> <span class="ruby-keyword kw">end</span> </pre> </div> </div> </div> + <div id="method-M000008" class="method-detail"> + <a name="M000008"></a> + + <div class="method-heading"> + <a href="#M000008" class="method-signature"> + <span class="method-name">log_likelihoods</span><span class="method-args">(text, classnames=nil)</span> + </a> + </div> + + <div class="method-description"> + <p> +Classes is an array of classes to look at +</p> + <p><a class="source-toggle" href="#" + onclick="toggleCode('M000008-source');return false;">[Source]</a></p> + <div class="method-source-code" id="M000008-source"> +<pre> +<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 66</span> + <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword kw">nil</span>) + <span class="ruby-identifier">classnames</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@classnames</span> + <span class="ruby-identifier">result</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span> + + <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span> + <span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classnames</span>) + <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> (<span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) <span class="ruby-operator">*</span> <span class="ruby-identifier">count</span>) } + } + + <span class="ruby-comment cmt"># add the prior and exponentiate</span> + <span class="ruby-identifier">doc_counts</span> = <span class="ruby-identifier">doc_count_totals</span>.<span class="ruby-identifier">select</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">k</span> }.<span class="ruby-identifier">map</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">v</span> } + <span class="ruby-identifier">doc_count_total</span> = (<span class="ruby-identifier">doc_counts</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> } <span class="ruby-operator">+</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">length</span>).<span class="ruby-identifier">to_f</span> + <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> + <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>((<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">k</span>) <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">doc_count_total</span>) + } + + <span class="ruby-identifier">result</span> + <span class="ruby-keyword kw">end</span> +</pre> + </div> + </div> + </div> + <div id="method-M000004" class="method-detail"> <a name="M000004"></a> <div class="method-heading"> <a href="#M000004" class="method-signature"> @@ -242,10 +281,13 @@ } <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_total_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">th</span>.<span class="ruby-identifier">word_count</span> <span class="ruby-identifier">doccount</span> = (<span class="ruby-identifier">text</span>.<span class="ruby-identifier">kind_of?</span> <span class="ruby-constant">Array</span>) <span class="ruby-operator">?</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">:</span> <span class="ruby-value">1</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_doc_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">doccount</span> <span class="ruby-ivar">@classnames</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">klass</span> <span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">klass</span> + <span class="ruby-comment cmt"># cache is now dirty of these vars</span> + <span class="ruby-ivar">@doc_count_totals</span> = <span class="ruby-keyword kw">nil</span> + <span class="ruby-ivar">@vocab_sizes</span> = <span class="ruby-keyword kw">nil</span> <span class="ruby-identifier">th</span> <span class="ruby-keyword kw">end</span> </pre> </div> </div> @@ -266,50 +308,101 @@ </p> <p><a class="source-toggle" href="#" onclick="toggleCode('M000005-source');return false;">[Source]</a></p> <div class="method-source-code" id="M000005-source"> <pre> -<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 29</span> +<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 32</span> <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">untrain</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">text</span>) <span class="ruby-identifier">th</span> = <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>) <span class="ruby-identifier">th</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">word</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">count</span> <span class="ruby-keyword kw">yield</span> <span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">block_given?</span> } <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_total_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">th</span>.<span class="ruby-identifier">word_count</span> <span class="ruby-identifier">doccount</span> = (<span class="ruby-identifier">text</span>.<span class="ruby-identifier">kind_of?</span> <span class="ruby-constant">Array</span>) <span class="ruby-operator">?</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">:</span> <span class="ruby-value">1</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_doc_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">doccount</span> + <span class="ruby-comment cmt"># cache is now dirty of these vars</span> + <span class="ruby-ivar">@doc_count_totals</span> = <span class="ruby-keyword kw">nil</span> + <span class="ruby-ivar">@vocab_sizes</span> = <span class="ruby-keyword kw">nil</span> <span class="ruby-identifier">th</span> <span class="ruby-keyword kw">end</span> </pre> </div> </div> </div> <h3 class="section-bar">Protected Instance methods</h3> - <div id="method-M000008" class="method-detail"> - <a name="M000008"></a> + <div id="method-M000010" class="method-detail"> + <a name="M000010"></a> <div class="method-heading"> - <a href="#M000008" class="method-signature"> - <span class="method-name">get_word_probs</span><span class="method-args">(word)</span> + <a href="#M000010" class="method-signature"> + <span class="method-name">doc_count_totals</span><span class="method-args">()</span> </a> </div> <div class="method-description"> <p><a class="source-toggle" href="#" - onclick="toggleCode('M000008-source');return false;">[Source]</a></p> - <div class="method-source-code" id="M000008-source"> + onclick="toggleCode('M000010-source');return false;">[Source]</a></p> + <div class="method-source-code" id="M000010-source"> <pre> -<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 67</span> - <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>) - <span class="ruby-identifier">probs</span> = <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_word_counts</span>(<span class="ruby-identifier">word</span>) - <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">cn</span><span class="ruby-operator">|</span> +<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 97</span> + <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">doc_count_totals</span> + <span class="ruby-ivar">@doc_count_totals</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">doc_count_totals</span> + <span class="ruby-keyword kw">end</span> +</pre> + </div> + </div> + </div> + + <div id="method-M000009" class="method-detail"> + <a name="M000009"></a> + + <div class="method-heading"> + <a href="#M000009" class="method-signature"> + <span class="method-name">get_word_probs</span><span class="method-args">(word, classnames)</span> + </a> + </div> + + <div class="method-description"> + <p><a class="source-toggle" href="#" + onclick="toggleCode('M000009-source');return false;">[Source]</a></p> + <div class="method-source-code" id="M000009-source"> +<pre> +<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 86</span> + <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classnames</span>) + <span class="ruby-identifier">probs</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span> + <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_word_counts</span>(<span class="ruby-identifier">word</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">v</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">k</span> } + <span class="ruby-identifier">vs</span> = <span class="ruby-identifier">vocab_sizes</span> + <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">cn</span><span class="ruby-operator">|</span> <span class="ruby-comment cmt"># use a laplacian smoother</span> - <span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] = (<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> (<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_total_word_count</span>(<span class="ruby-identifier">cn</span>) <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> + <span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] = (<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> (<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_total_word_count</span>(<span class="ruby-identifier">cn</span>) <span class="ruby-operator">+</span> <span class="ruby-identifier">vs</span>[<span class="ruby-identifier">cn</span>]).<span class="ruby-identifier">to_f</span> } <span class="ruby-identifier">probs</span> + <span class="ruby-keyword kw">end</span> +</pre> + </div> + </div> + </div> + + <div id="method-M000011" class="method-detail"> + <a name="M000011"></a> + + <div class="method-heading"> + <a href="#M000011" class="method-signature"> + <span class="method-name">vocab_sizes</span><span class="method-args">()</span> + </a> + </div> + + <div class="method-description"> + <p><a class="source-toggle" href="#" + onclick="toggleCode('M000011-source');return false;">[Source]</a></p> + <div class="method-source-code" id="M000011-source"> +<pre> +<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 101</span> + <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">vocab_sizes</span> + <span class="ruby-ivar">@vocab_sizes</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_vocabulary_sizes</span> <span class="ruby-keyword kw">end</span> </pre> </div> </div> </div> \ No newline at end of file