Sha256: 3b5cbd303dd304754652b745513fb24c01770fdb4e4c5114208cf4c233b58b86

Contents?: true

Size: 1.28 KB

Versions: 4

Compression:

Stored size: 1.28 KB

Contents

require 'sinatra'
require 'uri'
require 'mongo'
require 'date'
require 'time'
require File.expand_path('../../src/language-detector', __FILE__)

use Rack::MethodOverride

# Tweets come from a MongoDB collection. The connection string is read from
# the MONGOHQ_URL environment variable; the database name is the URL path
# with its leading slash removed.
mongo_url = ENV['MONGOHQ_URL']
uri = URI.parse(mongo_url)
db_name = uri.path.gsub(/^\//, '')
conn = Mongo::Connection.from_uri(mongo_url)
db = conn.db(db_name)
coll = db["tweets"]

# Classifier shared by every route, loaded once at boot from a YAML file.
DETECTOR = LanguageDetector.load_yaml("detector2.yaml")

helpers do
  # Renders the Haml template +page+ without the surrounding layout.
  #
  # page   - template name passed straight through to +haml+.
  # locals - Hash of local variables exposed to the template (default: {}).
  #
  # Returns whatever +haml+ returns for the rendered template.
  def partial(page, locals = {})
    haml(page, :locals => locals, :layout => false)
  end
end

layout 'layout'

# GET / — renders the :index template.
get('/') { haml :index }

# POST / — classifies the submitted sentence and re-renders the :index
# template. @sentence holds the submitted text (nil when the parameter is
# absent); @language is only assigned when a sentence was submitted.
post '/' do
  @sentence = params[:sentence]

  if @sentence
    if DETECTOR.classify(@sentence) == "majority"
      @language = "English"
    else
      @language = "Not English"
    end
  end

  haml :index
end

# GET /tweet — picks one document at random from the tweets collection,
# classifies its 'text' field and renders the :tweet template without the
# layout.
#
# Sets @tweet to the tweet text and @language to "English" or "Not English".
# Responds with 404 when there is no document to show.
get '/tweet' do
  # rand(0) returns a Float instead of an index, so an empty collection must
  # be handled before choosing a random offset.
  total = coll.count
  doc = total > 0 ? coll.find.limit(-1).skip(rand(total)).first : nil
  halt 404, 'No tweets available.' unless doc

  @tweet = doc['text']
  @language = DETECTOR.classify(@tweet) == "majority" ? "English" : "Not English"

  # Optional check for non-Roman characters. Not strictly necessary, but some
  # tweets consist solely of non-Roman characters, in which case the
  # classifier fails (it currently removes all non-ASCII characters).
  # The character class spans the ASCII range \x00-\x7F, so any word holding
  # a character outside it is counted.
  non_ascii_words = @tweet.split.count { |word| word =~ /[^\x00-\x7F]/ }
  @language = "Not English" if non_ascii_words > 1

  haml :tweet, :layout => false
end

Version data entries

4 entries across 4 versions & 1 rubygems

Version Path
unsupervised-language-detection-0.0.4 website/main.rb
unsupervised-language-detection-0.0.3 website/main.rb
unsupervised-language-detection-0.0.2 website/main.rb
unsupervised-language-detection-0.0.1 website/main.rb