Sha256: f0b00c9e88bec0944bde8bcd6aeca64f9ddc5874cfcbcb9bef15dd630c181bd5

Contents?: true

Size: 1.39 KB

Versions: 2

Compression:

Stored size: 1.39 KB

Contents

module Plagiarism
  module Strategies
    # Base class for search-backed plagiarism checks.
    #
    # The class methods +fetch+ and +iterate+ are abstract here (they only
    # raise); a concrete strategy is expected to override both. Everything
    # else in this class is written in terms of those two hooks.
    class Engine
      # Minimum fraction (0.0..1.0) of segments that must be "valid" (see
      # +valid_segments+) for +unique?+ to return true.
      THRESHOLD = 0.8

      attr_accessor :content, :params

      class << self
        # Abstract hook: run a search for +content+ using +params+.
        #
        # Callers in this class expect the result to respond to +success?+
        # and +response_body+ (presumably a Typhoeus response; confirm
        # against the concrete strategies).
        #
        # @raise [RuntimeError] always, unless overridden by a subclass
        def fetch(content, params)
          raise "#{self}.fetch must be implemented by a subclass"
        end

        # Abstract hook: walk the links found in response body +r+, yielding
        # each one to the given block, combined with the enumerable method
        # named by +a+ (+:all?+ by default; +retrieve_link+ passes +:find+).
        #
        # The blocks passed by this class call +host+ and +to_s+ on each
        # yielded object (presumably URI instances; verify in subclasses).
        #
        # @raise [RuntimeError] always, unless overridden by a subclass
        def iterate(r, a = :all?)
          raise "#{self}.iterate must be implemented by a subclass"
        end

        # Checks the links in +response+ against the whitelist.
        #
        # @return the result of +iterate+ with its default +:all?+ mode and a
        #   block that is truthy when a link's host matches the whitelist
        def exists?(response)
          # Build the whitelist regex once instead of once per yielded link.
          regex = whitelists_regex
          iterate(response) { |uri| uri.host =~ regex }
        end

        # Counts the segments of +ps+ whose quoted search both succeeds and
        # passes +exists?+.
        #
        # @param ps an object responding to +segment+ (an enumerable of sentences)
        # @param params passed through to +fetch+ unchanged
        # @return [Integer] number of valid segments
        def valid_segments(ps, params)
          ps.segment.count do |sentence|
            # Wrap the sentence in double quotes before handing it to +fetch+.
            result = fetch("\"#{sentence}\"", params)
            result.success? && exists?(result.response_body)
          end
        end

        # Builds a single regex that matches any entry of +Config.whitelists+.
        #
        # @return [Regexp] union of all whitelist patterns
        def whitelists_regex
          patterns = Config.whitelists.map { |w| Regexp.new(w) }
          Regexp.union(patterns)
        end
      end

      # @param c the text to check; stored as +content+
      # @param p options forwarded to +fetch+; stored as +params+
      def initialize(c, p)
        @content = c
        @params = p
      end

      # Decides whether the content is unique: the share of valid segments
      # must reach +THRESHOLD+.
      #
      # Content that yields no segments is reported as unique, since there is
      # nothing to match (previously this raised ZeroDivisionError).
      #
      # @return [Boolean]
      def unique?
        ps = PragmaticSegmenter::Segmenter.new(text: content)
        total = ps.segment.size
        return true if total.zero?

        # fdiv forces float division; Integer#/ would truncate the ratio to
        # 0 or 1 and make THRESHOLD meaningless.
        self.class.valid_segments(ps, params).fdiv(total) >= THRESHOLD
      end

      # Searches for the whole content as one quoted phrase.
      #
      # @return [false] when the fetch did not succeed; otherwise whatever
      #   +retrieve_link+ returns for the response body
      def match
        result = self.class.fetch("\"#{content}\"", params)
        result.success? && retrieve_link(result.response_body)
      end

      # Finds the first link in +response+ whose host is NOT whitelisted.
      #
      # @return [String] that link as a string when one is found; otherwise
      #   whatever +iterate+ returns in +:find+ mode when the block never
      #   exits early
      def retrieve_link(response)
        regex = self.class.whitelists_regex
        self.class.iterate(response, :find) do |uri|
          # Returns from retrieve_link itself, not just from the block.
          return uri.to_s unless uri.host =~ regex
        end
      end
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
plagiarism2-0.0.8 lib/plagiarism/strategries/engine.rb
plagiarism2-0.0.7 lib/plagiarism/strategries/engine.rb