Sha256: d7260b0597c3d70cbb1c54ab4234f2f1c06debe3e7badbe0e70bc8bddfd6b38e
Contents?: true
Size: 1.11 KB
Versions: 5
Compression:
Stored size: 1.11 KB
Contents
# Marks each TextBlock of a document as content or not-content using a fixed
# decision tree over text densities and link densities. The rules were
# determined with the C4.8 machine learning algorithm, as described in the
# paper "Boilerplate Detection using Shallow Text Features".
module Boilerpipe::Filters
  class DensityRulesClassifier
    # Classifies every text block in +doc+ by looking at the block itself and
    # its immediate neighbours, storing the verdict via +content=+.
    #
    # doc - an object responding to +text_blocks+ (an Array of blocks).
    #
    # Returns the same +doc+ that was passed in.
    def self.process(doc)
      # The same sentinel block stands in for the missing neighbour before the
      # first block and after the last one.
      sentinel = Boilerpipe::Document::TextBlock.empty_start
      padded = [sentinel].concat(doc.text_blocks).push(sentinel)

      # No size guard is needed: with no text blocks the padded list holds
      # only the two sentinels, so each_cons(3) yields nothing.
      padded.each_cons(3) do |before, block, after|
        block.content = classify(before, block, after)
      end

      doc
    end

    # Applies the density rules to a single block.
    #
    # prev    - the preceding block (or the sentinel for the first block).
    # current - the block being classified.
    # nxt     - the following block (or the sentinel for the last block).
    #
    # Each argument must respond to +link_density+ and +text_density+.
    #
    # Returns true when +current+ is judged to be content, false otherwise.
    def self.classify(prev, current, nxt)
      # A block dominated by links is never content.
      return false if current.link_density > 0.333333

      # Link-heavy predecessor: content only when the next block is dense.
      return !(nxt.text_density <= 11) unless prev.link_density <= 0.555556

      if current.text_density <= 9
        # Sparse block: kept when followed by dense text, or when the
        # previous block is not itself very sparse.
        nxt.text_density > 10 || !(prev.text_density <= 4)
      else
        # Dense block: kept unless the next block has zero text density.
        nxt.text_density != 0
      end
    end
  end
end
Version data entries
5 entries across 5 versions & 1 rubygems