Sha256: 6504df8fcdaa36a292bc4f651fb4cac812ac9c09d6160cd462b7a0ddfd9d32c6

Contents?: true

Size: 1.26 KB

Versions: 68

Compression:

Stored size: 1.26 KB

Contents

require "fileutils"

# Maps each language name (used in the download URL and as the name of the
# temporary downloaded file) to the ISO language code that names the cleaned
# output file. Frozen so the table cannot be modified by accident.
LANGUAGE_MAP = {:danish     => :da,
                :dutch      => :nl,
                :english    => :en,
                :finnish    => :fi,
                :french     => :fr,
                :german     => :de,
                :hungarian  => :hu,
                :italian    => :it,
                :norwegian  => :no,
                :portuguese => :pt,
                :russian    => :ru,
                :spanish    => :es,
                :swedish    => :sv}.freeze

# 1. Download the stop word lists from snowball.tartarus.org into
#    "<language>.txt", converting them to UTF-8 on the way.
#    Every list is converted from ISO-8859-1 except the Russian one, which is
#    converted from KOI8-R and is therefore fetched by its own command.
latin1_languages = LANGUAGE_MAP.keys.reject { |language| language == :russian }
latin1_command   = "curl http://snowball.tartarus.org/algorithms/%s/stop.txt | iconv -f ISO-8859-1 -t UTF-8 > %s.txt"
latin1_languages.each do |language|
  system(latin1_command % [language, language])
end
system("curl http://snowball.tartarus.org/algorithms/russian/stop.txt | iconv -f KOI8-R -t UTF-8 > russian.txt")

# 2. Clean up the files (remove comments) and write a new file with the iso name.
#    For every downloaded "<language>.txt" an "<iso code>.txt" is written that
#    holds one lowercased word per line: the first whitespace-separated token
#    of each line that is kept.
LANGUAGE_MAP.each do |lang, iso_code|
  File.open("#{iso_code}.txt", "w") do |outfile|
    File.open("#{lang}.txt", "r") do |infile|
      infile.each_line do |line|
        # Skip lines that start with a space, empty lines and lines that
        # start with "|" (comment lines).
        next if line =~ /^ +|^$|^\|/

        # A whitespace-only line that slipped through the filter above
        # (e.g. a lone tab or a bare "\r\n") yields no token at all; skip it
        # instead of failing on nil.
        word = line.split(" ", 2).first
        next if word.nil?

        outfile.puts word.downcase.strip
      end
    end
  end
end

# 3. Delete the temporary "<language>.txt" downloads; only the files named
#    after the iso codes are kept.
downloaded_files = LANGUAGE_MAP.keys.map { |lang| "#{lang}.txt" }
FileUtils.rm_rf(downloaded_files)


Version data entries

68 entries across 68 versions & 1 rubygems

Version Path
xapian_db-0.5.2 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-0.5.1 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-0.5.0 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-0.4.2 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-0.4.1 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-0.4.0 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-0.3.4 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-0.3.3 lib/xapian_db/stopwords/update_stopwords.rb