Sha256: d3bc00faa0216773df3191bf26fceb8973ce368338f0ce04f32cccd4876f4433

Contents?: true

Size: 1.85 KB

Versions: 2

Compression:

Stored size: 1.85 KB

Contents

require 'linguist/classifier'
require 'linguist/language'

module Linguist
  # Model for accessing classifier training data.
  #
  # A Sample pairs the full path of one fixture file with the Language that
  # the file's parent directory name resolves to.
  class Sample
    # Samples live in test/ for now, we'll eventually move them out
    PATH = File.expand_path("../../../test/fixtures", __FILE__)

    # Public: Iterate over each Sample.
    #
    # Categories (directories under PATH) and the files inside them are
    # visited in sorted name order, so the sequence of samples does not
    # depend on the order the filesystem happens to list entries in.
    #
    # &block - Yields Sample to block
    #
    # Returns nothing when a block is given, or an Enumerator over the
    # Samples when called without a block.
    # Raises RuntimeError if a category name does not map to a Language.
    def self.each(&block)
      # Without a block, hand back an Enumerator instead of failing with a
      # LocalJumpError part-way through the directory walk.
      return enum_for(:each) unless block_given?

      sorted_entries(PATH).each do |category|
        # Skip text and binary for now
        # Possibly reconsider this later
        next if category == 'text' || category == 'binary'

        # Map directory name to a Language alias
        language = Linguist::Language.find_by_alias(category)
        raise "No language for #{category.inspect}" unless language

        dirname = File.join(PATH, category)
        sorted_entries(dirname).each do |filename|
          yield new(File.join(dirname, filename), language)
        end
      end

      nil
    end

    # Internal: List the entries of a directory in a stable order.
    #
    # dirname - String path of the directory to list.
    #
    # Returns a sorted Array of entry name Strings, without "." and "..".
    def self.sorted_entries(dirname)
      (Dir.entries(dirname) - ['.', '..']).sort
    end
    private_class_method :sorted_entries

    # Public: Build Classifier from all samples.
    #
    # Every Sample's language and file contents are fed to Classifier#train.
    #
    # Returns the result of Classifier#gc (presumably the trained Classifier
    # itself; confirm against Classifier).
    def self.classifier
      classifier = Classifier.new
      each { |sample| classifier.train(sample.language, sample.data) }
      classifier.gc
    end

    # Internal: Initialize Sample.
    #
    # Samples should be initialized by Sample.each.
    #
    # path     - String full path to file.
    # language - Language of sample.
    def initialize(path, language)
      @path     = path
      @language = language
    end

    # Public: Get full path to file.
    #
    # Returns String.
    attr_reader :path

    # Public: Get sample language.
    #
    # Returns Language.
    attr_reader :language

    # Public: Read file contents.
    #
    # The file is read from disk on every call; nothing is cached.
    #
    # Returns String.
    def data
      File.read(path)
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
github-linguist-2.0.1 lib/linguist/sample.rb
github-linguist-2.0.0 lib/linguist/sample.rb