docs/classes/Ankusa/Classifier.html in ankusa-0.0.5 vs docs/classes/Ankusa/Classifier.html in ankusa-0.0.6
- old
+ new
@@ -86,14 +86,17 @@
<h3 class="section-bar">Methods</h3>
<div class="name-list">
<a href="#M000007">classifications</a>
<a href="#M000006">classify</a>
- <a href="#M000008">get_word_probs</a>
+ <a href="#M000010">doc_count_totals</a>
+ <a href="#M000009">get_word_probs</a>
+ <a href="#M000008">log_likelihoods</a>
<a href="#M000003">new</a>
<a href="#M000004">train</a>
<a href="#M000005">untrain</a>
+ <a href="#M000011">vocab_sizes</a>
</div>
</div>
</div>
@@ -156,37 +159,32 @@
<div id="method-M000007" class="method-detail">
<a name="M000007"></a>
<div class="method-heading">
<a href="#M000007" class="method-signature">
- <span class="method-name">classifications</span><span class="method-args">(text)</span>
+ <span class="method-name">classifications</span><span class="method-args">(text, classnames=nil)</span>
</a>
</div>
<div class="method-description">
+ <p>
+classnames is an optional array of class names to consider; when nil, every class known to the classifier is used
+</p>
<p><a class="source-toggle" href="#"
onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
<div class="method-source-code" id="M000007-source">
<pre>
-<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 46</span>
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>)
- <span class="ruby-identifier">result</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
-
- <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
- <span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>)
- <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> (<span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) <span class="ruby-operator">*</span> <span class="ruby-identifier">count</span>) }
+<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 53</span>
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword kw">nil</span>)
+ <span class="ruby-identifier">result</span> = <span class="ruby-identifier">log_likelihoods</span> <span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>
+ <span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
+ <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>]
}
- <span class="ruby-comment cmt"># add the prior and exponentiate</span>
- <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
- <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">k</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">doc_count_total</span>.<span class="ruby-identifier">to_f</span>)
- <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-constant">Math</span>.<span class="ruby-identifier">exp</span>(<span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>])
- }
-
<span class="ruby-comment cmt"># normalize to get probs</span>
<span class="ruby-identifier">sum</span> = <span class="ruby-identifier">result</span>.<span class="ruby-identifier">values</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> }
- <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span> }
+ <span class="ruby-identifier">result</span>.<span class="ruby-identifier">keys</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">/</span> <span class="ruby-identifier">sum</span> }
<span class="ruby-identifier">result</span>
<span class="ruby-keyword kw">end</span>
</pre>
</div>
</div>
@@ -195,29 +193,70 @@
<div id="method-M000006" class="method-detail">
<a name="M000006"></a>
<div class="method-heading">
<a href="#M000006" class="method-signature">
- <span class="method-name">classify</span><span class="method-args">(text)</span>
+ <span class="method-name">classify</span><span class="method-args">(text, classes=nil)</span>
</a>
</div>
<div class="method-description">
<p><a class="source-toggle" href="#"
onclick="toggleCode('M000006-source');return false;">[Source]</a></p>
<div class="method-source-code" id="M000006-source">
<pre>
-<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 41</span>
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>)
+<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 47</span>
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">classify</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>=<span class="ruby-keyword kw">nil</span>)
<span class="ruby-comment cmt"># return the most probable class</span>
- <span class="ruby-identifier">classifications</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">sort_by</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">c</span><span class="ruby-operator">|</span> <span class="ruby-operator">-</span><span class="ruby-identifier">c</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span>
+ <span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classes</span>).<span class="ruby-identifier">sort_by</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">c</span><span class="ruby-operator">|</span> <span class="ruby-operator">-</span><span class="ruby-identifier">c</span>[<span class="ruby-value">1</span>] }.<span class="ruby-identifier">first</span>.<span class="ruby-identifier">first</span>
<span class="ruby-keyword kw">end</span>
</pre>
</div>
</div>
</div>
+ <div id="method-M000008" class="method-detail">
+ <a name="M000008"></a>
+
+ <div class="method-heading">
+ <a href="#M000008" class="method-signature">
+ <span class="method-name">log_likelihoods</span><span class="method-args">(text, classnames=nil)</span>
+ </a>
+ </div>
+
+ <div class="method-description">
+ <p>
+classnames is an optional array of class names to consider; when nil, every class known to the classifier is used
+</p>
+ <p><a class="source-toggle" href="#"
+ onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
+ <div class="method-source-code" id="M000008-source">
+<pre>
+<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 66</span>
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">log_likelihoods</span>(<span class="ruby-identifier">text</span>, <span class="ruby-identifier">classnames</span>=<span class="ruby-keyword kw">nil</span>)
+ <span class="ruby-identifier">classnames</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@classnames</span>
+ <span class="ruby-identifier">result</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
+
+ <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
+ <span class="ruby-identifier">probs</span> = <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classnames</span>)
+ <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> (<span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>(<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>]) <span class="ruby-operator">*</span> <span class="ruby-identifier">count</span>) }
+ }
+
+ <span class="ruby-comment cmt"># add the log prior (add-one smoothed document counts)</span>
+ <span class="ruby-identifier">doc_counts</span> = <span class="ruby-identifier">doc_count_totals</span>.<span class="ruby-identifier">select</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">k</span> }.<span class="ruby-identifier">map</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">v</span> }
+ <span class="ruby-identifier">doc_count_total</span> = (<span class="ruby-identifier">doc_counts</span>.<span class="ruby-identifier">inject</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">x</span>,<span class="ruby-identifier">y</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span><span class="ruby-operator">+</span><span class="ruby-identifier">y</span> } <span class="ruby-operator">+</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">length</span>).<span class="ruby-identifier">to_f</span>
+ <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span><span class="ruby-operator">|</span>
+ <span class="ruby-identifier">result</span>[<span class="ruby-identifier">k</span>] <span class="ruby-operator">+=</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>((<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_doc_count</span>(<span class="ruby-identifier">k</span>) <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">doc_count_total</span>)
+ }
+
+ <span class="ruby-identifier">result</span>
+ <span class="ruby-keyword kw">end</span>
+</pre>
+ </div>
+ </div>
+ </div>
+
<div id="method-M000004" class="method-detail">
<a name="M000004"></a>
<div class="method-heading">
<a href="#M000004" class="method-signature">
@@ -242,10 +281,13 @@
}
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_total_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">th</span>.<span class="ruby-identifier">word_count</span>
<span class="ruby-identifier">doccount</span> = (<span class="ruby-identifier">text</span>.<span class="ruby-identifier">kind_of?</span> <span class="ruby-constant">Array</span>) <span class="ruby-operator">?</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">:</span> <span class="ruby-value">1</span>
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_doc_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">doccount</span>
<span class="ruby-ivar">@classnames</span> <span class="ruby-operator"><<</span> <span class="ruby-identifier">klass</span> <span class="ruby-keyword kw">if</span> <span class="ruby-keyword kw">not</span> <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">klass</span>
+ <span class="ruby-comment cmt"># cached values are now stale; reset them</span>
+ <span class="ruby-ivar">@doc_count_totals</span> = <span class="ruby-keyword kw">nil</span>
+ <span class="ruby-ivar">@vocab_sizes</span> = <span class="ruby-keyword kw">nil</span>
<span class="ruby-identifier">th</span>
<span class="ruby-keyword kw">end</span>
</pre>
</div>
</div>
@@ -266,50 +308,101 @@
</p>
<p><a class="source-toggle" href="#"
onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
<div class="method-source-code" id="M000005-source">
<pre>
-<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 29</span>
+<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 32</span>
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">untrain</span>(<span class="ruby-identifier">klass</span>, <span class="ruby-identifier">text</span>)
<span class="ruby-identifier">th</span> = <span class="ruby-constant">TextHash</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">text</span>)
<span class="ruby-identifier">th</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span><span class="ruby-operator">|</span>
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-identifier">word</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">count</span>
<span class="ruby-keyword kw">yield</span> <span class="ruby-identifier">word</span>, <span class="ruby-identifier">count</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">block_given?</span>
}
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_total_word_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">th</span>.<span class="ruby-identifier">word_count</span>
<span class="ruby-identifier">doccount</span> = (<span class="ruby-identifier">text</span>.<span class="ruby-identifier">kind_of?</span> <span class="ruby-constant">Array</span>) <span class="ruby-operator">?</span> <span class="ruby-identifier">text</span>.<span class="ruby-identifier">length</span> <span class="ruby-operator">:</span> <span class="ruby-value">1</span>
<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">incr_doc_count</span> <span class="ruby-identifier">klass</span>, <span class="ruby-operator">-</span><span class="ruby-identifier">doccount</span>
+ <span class="ruby-comment cmt"># cached values are now stale; reset them</span>
+ <span class="ruby-ivar">@doc_count_totals</span> = <span class="ruby-keyword kw">nil</span>
+ <span class="ruby-ivar">@vocab_sizes</span> = <span class="ruby-keyword kw">nil</span>
<span class="ruby-identifier">th</span>
<span class="ruby-keyword kw">end</span>
</pre>
</div>
</div>
</div>
<h3 class="section-bar">Protected Instance methods</h3>
- <div id="method-M000008" class="method-detail">
- <a name="M000008"></a>
+ <div id="method-M000010" class="method-detail">
+ <a name="M000010"></a>
<div class="method-heading">
- <a href="#M000008" class="method-signature">
- <span class="method-name">get_word_probs</span><span class="method-args">(word)</span>
+ <a href="#M000010" class="method-signature">
+ <span class="method-name">doc_count_totals</span><span class="method-args">()</span>
</a>
</div>
<div class="method-description">
<p><a class="source-toggle" href="#"
- onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
- <div class="method-source-code" id="M000008-source">
+ onclick="toggleCode('M000010-source');return false;">[Source]</a></p>
+ <div class="method-source-code" id="M000010-source">
<pre>
-<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 67</span>
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>)
- <span class="ruby-identifier">probs</span> = <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_word_counts</span>(<span class="ruby-identifier">word</span>)
- <span class="ruby-ivar">@classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">cn</span><span class="ruby-operator">|</span>
+<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 97</span>
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">doc_count_totals</span>
+ <span class="ruby-ivar">@doc_count_totals</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">doc_count_totals</span>
+ <span class="ruby-keyword kw">end</span>
+</pre>
+ </div>
+ </div>
+ </div>
+
+ <div id="method-M000009" class="method-detail">
+ <a name="M000009"></a>
+
+ <div class="method-heading">
+ <a href="#M000009" class="method-signature">
+ <span class="method-name">get_word_probs</span><span class="method-args">(word, classnames)</span>
+ </a>
+ </div>
+
+ <div class="method-description">
+ <p><a class="source-toggle" href="#"
+ onclick="toggleCode('M000009-source');return false;">[Source]</a></p>
+ <div class="method-source-code" id="M000009-source">
+<pre>
+<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 86</span>
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">get_word_probs</span>(<span class="ruby-identifier">word</span>, <span class="ruby-identifier">classnames</span>)
+ <span class="ruby-identifier">probs</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> <span class="ruby-value">0</span>
+ <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_word_counts</span>(<span class="ruby-identifier">word</span>).<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">k</span>,<span class="ruby-identifier">v</span><span class="ruby-operator">|</span> <span class="ruby-identifier">probs</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-identifier">v</span> <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">include?</span> <span class="ruby-identifier">k</span> }
+ <span class="ruby-identifier">vs</span> = <span class="ruby-identifier">vocab_sizes</span>
+ <span class="ruby-identifier">classnames</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">cn</span><span class="ruby-operator">|</span>
<span class="ruby-comment cmt"># use a laplacian smoother</span>
- <span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] = (<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> (<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_total_word_count</span>(<span class="ruby-identifier">cn</span>) <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span>
+ <span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] = (<span class="ruby-identifier">probs</span>[<span class="ruby-identifier">cn</span>] <span class="ruby-operator">+</span> <span class="ruby-value">1</span>).<span class="ruby-identifier">to_f</span> <span class="ruby-operator">/</span> (<span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_total_word_count</span>(<span class="ruby-identifier">cn</span>) <span class="ruby-operator">+</span> <span class="ruby-identifier">vs</span>[<span class="ruby-identifier">cn</span>]).<span class="ruby-identifier">to_f</span>
}
<span class="ruby-identifier">probs</span>
+ <span class="ruby-keyword kw">end</span>
+</pre>
+ </div>
+ </div>
+ </div>
+
+ <div id="method-M000011" class="method-detail">
+ <a name="M000011"></a>
+
+ <div class="method-heading">
+ <a href="#M000011" class="method-signature">
+ <span class="method-name">vocab_sizes</span><span class="method-args">()</span>
+ </a>
+ </div>
+
+ <div class="method-description">
+ <p><a class="source-toggle" href="#"
+ onclick="toggleCode('M000011-source');return false;">[Source]</a></p>
+ <div class="method-source-code" id="M000011-source">
+<pre>
+<span class="ruby-comment cmt"># File lib/ankusa/classifier.rb, line 101</span>
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">vocab_sizes</span>
+ <span class="ruby-ivar">@vocab_sizes</span> <span class="ruby-operator">||=</span> <span class="ruby-ivar">@storage</span>.<span class="ruby-identifier">get_vocabulary_sizes</span>
<span class="ruby-keyword kw">end</span>
</pre>
</div>
</div>
</div>
\ No newline at end of file