Sha256: 1f8adf7dbe2f7f7fd93e78c921b47a1be4eaeee45c450c9579681f587aab3ab1

Contents?: true

Size: 1.87 KB

Versions: 6

Compression:

Stored size: 1.87 KB

Contents

class CorrectHorseBatteryStaple::Parser
  class Regex < Base
    PARSERS  = {
        :wiktionary   =>  [%r{<a href="/wiki/\w+" title="(\w+)">\w+</a> = (\d+)},
          lambda {|match| CorrectHorseBatteryStaple::Word.new(:word => match[0], :frequency => match[1].to_i) }],

        # rank	lemma	PoS	freq	dispersion
        # 7	to	t	6332195	0.98
        :wordfrequency => [ %r{^(\d+)\s+(\w+)\s+\w*\s+(\d+)\s+([0-9.]+)},
        lambda {|match| CorrectHorseBatteryStaple::Word.new(:word => match[1],
            :rank => match[0].to_f,
            :frequency => match[3].to_f,
            :dispersion => match[4].to_f)
        }],

        # using tabs between columns
        # freq    word    PoS     # texts
        # -----   -----   -----   -----
        # 22995878        the     at      169011
        # 11239776        and     cc      168844
        :coca => [ %r{^(\d+)\s+(\w+)\s+\w*\s+(\d+)},
        lambda {|match| CorrectHorseBatteryStaple::Word.new(:word => match[1],
            :frequency => match[0].to_i,
            :texts => match[2].to_i)
        }],

      # <tr>
      # <td>25</td>
      # <td><a href="/wiki/be" title="be">be</a></td>
      # <td>191823</td>
      # </tr>
      :tvscripts => [
        Regexp.new('<tr>.*?<td>(\d+)</td>.*?<td>.*?title="(\w+)".*?</td>.*?<td>(\d+)</td>.*?</tr>', Regexp::MULTILINE),
        lambda {|match| CorrectHorseBatteryStaple::Word.new(
            :rank => match[0].to_i,
            :word => match[1],
            :frequency => match[2].to_i
            ) }
      ]
    }

    def initialize(type = :wiktionary)
      @parser_type = type.to_sym
    end

    def parse(file)
      raise ArgumentError, "unknown regex parser type #{@parser_type}" unless PARSERS.has_key?(@parser_type)
      (regex, lexer) = PARSERS[@parser_type]

      words = 
        file.read.scan(regex).map do |match|
          lexer.call(match)
        end

    end
  end
end

Version data entries

6 entries across 6 versions & 1 rubygems

Version Path
correct-horse-battery-staple-0.6.6 lib/correct_horse_battery_staple/parser/regex.rb
correct-horse-battery-staple-0.6.5 lib/correct_horse_battery_staple/parser/regex.rb
correct-horse-battery-staple-0.6.4 lib/correct_horse_battery_staple/parser/regex.rb
correct-horse-battery-staple-0.6.3 lib/correct_horse_battery_staple/parser/regex.rb
correct-horse-battery-staple-0.6.2 lib/correct_horse_battery_staple/parser/regex.rb
correct-horse-battery-staple-0.6.1 lib/correct_horse_battery_staple/parser/regex.rb