Sha256: 19d3b8cb80342478858e50d6e1e381642f3fb12e14367956dbf0f55d393669ff

Contents?: true

Size: 1.43 KB

Versions: 7

Compression:

Stored size: 1.43 KB

Contents

#!/usr/bin/env ruby

# Generate plain-text versions of articles from the tsv-converted raw article data
# (output from extract_articles)
#
# This strips out template tags, wiki links, and so forth
#
# Everything that's left is either actual text, or nicely detached punctuation.

# ## Usage
#
# Uses the output of extract_articles-templated.rb:
#
#    examples/munging/wikipedia/articles/textualize_articles.rb --rm --run \
#      /data/results/wikipedia/full/articles.json.tsv      \
#      /data/results/wikipedia/full/article_texts.json.tsv
#

require 'wukong'
require 'multi_json'
require 'oj'
require 'strscan'
require 'find'
require 'sanitize'
#
require_relative '../utils/munging_utils.rb'
require_relative './wp2txt_article'
require_relative './wp2txt_utils'

module TextualizeArticles

  class Mapper < Wukong::Streamer::RecordStreamer
    include MungingUtils

    @@errors   = 0
    MAX_ERRORS = 1_000

    def process(id, namespace, title, revision_id, timestamp, redirect, raw_text)

      text          = MultiJson.decode(raw_text)
      article       = Wp2txt::Article.new(text, title)
      jsonized_text = MultiJson.encode(article.polish)

      yield [id, namespace, title, revision_id, timestamp, redirect, jsonized_text]

    rescue StandardError => err
      Wukong.bad_record("Bad Record", err, raw_text)
      raise "Too many errors" if (@@errors += 1) > MAX_ERRORS
    end

  end
end

Wukong::Script.new(TextualizeArticles::Mapper, nil).run

Version data entries

7 entries across 7 versions & 2 rubygems

Version Path
ul-wukong-4.1.1 examples/munging/wikipedia/articles/textualize_articles.rb
ul-wukong-4.1.0 examples/munging/wikipedia/articles/textualize_articles.rb
wukong-4.0.0 examples/munging/wikipedia/articles/textualize_articles.rb
wukong-3.0.1 examples/munging/wikipedia/articles/textualize_articles.rb
wukong-3.0.0 examples/munging/wikipedia/articles/textualize_articles.rb
wukong-3.0.0.pre3 examples/munging/wikipedia/articles/textualize_articles.rb
wukong-3.0.0.pre2 examples/munging/wikipedia/articles/textualize_articles.rb