Sha256: c7b4549f97a3a67f0da63c7af309e35603f778f27fe843e37c8bdfa3a53829cb

Contents?: true

Size: 1.11 KB

Versions: 3

Compression:

Stored size: 1.11 KB

Contents

# Classifies TextBlocks as content/not-content through rules that have been determined
# using the C4.8 machine learning algorithm, as described in the paper
# "Boilerplate Detection using Shallow Text Features", particularly using text densities and link
# densities.

module Boilerpipe::Filters
  # Decision-tree classifier that flags each text block of a document as
  # content or non-content, looking only at the text density and link density
  # of the block itself and of its immediate neighbours.
  class DensityRulesClassifier
    # Classifies every text block of +doc+ in place and returns +doc+.
    #
    # The block list is padded on both ends with the block returned by
    # TextBlock.empty_start, so the first and last real blocks are also
    # visited with a predecessor and a successor. A document without text
    # blocks produces no three-element window and is returned untouched.
    def self.process(doc)
      boundary = Boilerpipe::Document::TextBlock.empty_start
      padded = [boundary].concat(doc.text_blocks) << boundary

      padded.each_cons(3) do |before, block, after|
        block.content = classify(before, block, after)
      end

      doc
    end

    # Evaluates the density rules for a single block.
    #
    # prev    - block preceding +current+ (or the boundary placeholder)
    # current - block being judged
    # nxt     - block following +current+ (or the boundary placeholder)
    #
    # Returns true when +current+ is considered content, false otherwise.
    #
    # The negated comparisons below are deliberate: they keep the result
    # identical for values that compare false in both directions
    # (e.g. Float::NAN), which a flipped operator would not.
    def self.classify(prev, current, nxt)
      if current.link_density > 0.333333
        # A link-heavy block is never content.
        false
      elsif prev.link_density <= 0.555556
        if current.text_density <= 9
          # Sparse block: kept when followed by dense text, or when the
          # preceding block is not sparse itself.
          nxt.text_density > 10 || !(prev.text_density <= 4)
        else
          # Dense block: kept unless the following block has zero density.
          nxt.text_density != 0
        end
      else
        # Preceded by a link-heavy block: only dense following text keeps it.
        !(nxt.text_density <= 11)
      end
    end
  end
end

Version data entries

3 entries across 3 versions & 1 rubygems

Version Path
boilerpipe-ruby-0.4.0 lib/boilerpipe/filters/density_rules_classifier.rb
boilerpipe-ruby-0.3.0 lib/boilerpipe/filters/density_rules_classifier.rb
boilerpipe-ruby-0.2.0 lib/boilerpipe/filters/density_rules_classifier.rb