Sha256: 93cdc2d88e090b9b39a255221ea389730faa3794612d884cab63e920388aece9

Contents?: true

Size: 1.75 KB

Versions: 2

Compression:

Stored size: 1.75 KB

Contents

# encoding: utf-8

require 'fast-stemmer'
require 'csv'

# logic ported from https://plugins.trac.wordpress.org/browser/wordpress-23-related-posts-plugin/trunk/recommendations.php

class Middleman::Blog::Similar::Algorithm::WordFrequency < ::Middleman::Blog::Similar::Algorithm
  @@unigrams = nil
  class << self
    def unigrams_path
      File.join File.dirname(__FILE__), 'unigrams.csv'
    end
    def unigrams
      if @@unigrams.nil?
        @@unigrams = {}
        CSV.foreach(unigrams_path, { :col_sep => "\t" }) do|row|
          @@unigrams[row[0]] = row[4].to_f if row.length == 5
        end
      end
      @@unigrams
    end
  end
  def distance a
    d = 0xffffff
    wf = a.similarity_algorithm.word_freq
    word_freq.each do|word, freq|
      if wf.has_key? word
        d -= wf[word] * freq
      end
    end
    d
  end
  def words
    re = /[\t\s\n,\.、。 ]/
    article.untagged_body.split(re) + article.title.split(re)
  end
  def generate_word_freq
    suitable_words = unigrams.dup
    word_freq= {}
    words.each do|word|
      word.downcase!
      word = word.stem if word =~ /^\w+$/
      word_freq[word] ||= 0
      word_freq[word] += 1
    end
    selected_words = {}
    word_freq.each do|word, freq|
      selected_words[word] = unigrams[word] * Math.sqrt(freq) if unigrams[word]
    end
    article.tags.each do|tag|
      tag = tag.downcase.stem
      word_freq[tag] ||= 0
      word_freq[tag] += tag_weight
    end
    Hash[ word_freq.sort_by{|k, v| k }.sort_by{|k, v| v } ]
  end
  def word_freq
    @word_freq ||= generate_word_freq
  end
  def generate_tags
    generate_word_freq.keys.reverse
  end
  def tags
    @tags ||= generate_tags
  end
  def tag_weight   ; 5                      ; end
  def unigrams     ; self.class.unigrams    ; end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
middleman-blog-similar-1.1.1 lib/middleman-blog-similar/algorithm/word_frequency.rb
middleman-blog-similar-1.1.0 lib/middleman-blog-similar/algorithm/word_frequency.rb