Sha256: 6504df8fcdaa36a292bc4f651fb4cac812ac9c09d6160cd462b7a0ddfd9d32c6

Contents?: true

Size: 1.26 KB

Versions: 68

Compression:

Stored size: 1.26 KB

Contents

require "fileutils"

# Maps the language name used in the download URL (and as the name of the
# downloaded file) to the short code used as the name of the cleaned file.
# Frozen because every step below only reads it.
LANGUAGE_MAP = {
  :danish     => :da,
  :dutch      => :nl,
  :english    => :en,
  :finnish    => :fi,
  :french     => :fr,
  :german     => :de,
  :hungarian  => :hu,
  :italian    => :it,
  :norwegian  => :no,
  :portuguese => :pt,
  :russian    => :ru,
  :spanish    => :es,
  :swedish    => :sv
}.freeze

# 1. Load the stop words files from snowball.tartarus.org
#
# Each list is fetched with curl, converted to UTF-8 with iconv and saved as
# "<language>.txt" in the current directory. The Russian list is converted
# from KOI8-R, all others from ISO-8859-1.
#
# The run is aborted as soon as a download leaves no data behind, so that
# step 2 never rewrites the cleaned files from an empty source.
LANGUAGE_MAP.each_key do |lang|
  source_charset = lang == :russian ? "KOI8-R" : "ISO-8859-1"
  raw_file       = "#{lang}.txt"
  url            = "http://snowball.tartarus.org/algorithms/#{lang}/stop.txt"

  # --fail keeps the body of an HTTP error page out of the word list and
  # --location follows redirects. The exit status of the pipeline is iconv's,
  # so a failed curl only shows up as an empty file - hence the size check.
  converted = system("curl --fail --location #{url} | iconv -f #{source_charset} -t UTF-8 > #{raw_file}")
  abort "Could not download the #{lang} stop words from #{url}" unless converted && File.size?(raw_file)
end

# 2. Clean up the files (remove comments) and write a new file with the iso name
#
# For every language the downloaded "<language>.txt" is read line by line.
# Lines that start with a space, are empty, or start with "|" are dropped.
# From every other line only the first whitespace-separated token is kept
# (anything after it is discarded), lowercased, and written on its own line
# to "<code>.txt".
LANGUAGE_MAP.each do |lang, iso_code|
  # The downloads were converted to UTF-8 in step 1, so read and write them as
  # UTF-8 explicitly instead of depending on the locale's default encoding.
  File.open("#{iso_code}.txt", "w:UTF-8") do |outfile|
    File.open("#{lang}.txt", "r:UTF-8") do |infile|
      infile.each_line do |line|
        next if line =~ /^ +|^$|^\|/

        # A line made only of whitespace that does not start with a space
        # (e.g. a lone tab or carriage return) yields no token at all.
        word = line.split(" ", 2).first
        next unless word

        outfile.puts word.downcase.strip
      end
    end
  end
end

# 3. Remove the downloaded files
# Only the "<language>.txt" downloads are deleted; the files named after the
# short codes stay in place.
downloaded_files = LANGUAGE_MAP.keys.map { |lang| "#{lang}.txt" }
FileUtils.rm_rf(downloaded_files)


Version data entries

68 entries across 68 versions & 1 rubygems

Version Path
xapian_db-1.3.15 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-1.3.14 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-1.3.13 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-1.3.12 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-1.3.11 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-1.3.10 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-1.3.9 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-1.3.8 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-1.3.7.4 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-1.3.7.3 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-1.3.7.2 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-1.3.7.1 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-1.3.7 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-1.3.5.4 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-1.3.5.3 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-1.3.5.2 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-1.3.5.1 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-1.3.5 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-1.3.4 lib/xapian_db/stopwords/update_stopwords.rb
xapian_db-1.3.3.1 lib/xapian_db/stopwords/update_stopwords.rb