Sha256: d54e8660cdce4b8d44ebf33747a6ef423993ef14e4d4d0fe6f148532f805e565

Contents?: true

Size: 1.2 KB

Versions: 1

Compression:

Stored size: 1.2 KB

Contents

/*
 * This script filters the articles table, leaving only the articles
 * in the specified subuniverse.
 *
 * Output format:
 * page_id:int, title:chararray, namespace:int, rev_date:int, rev_time:int, 
 * rev_epoch_time:long, rev_dow:int, article_text:chararray
 */

%default ARTICLES         '/data/results/wikipedia/full/articles' -- all articles in the wikipedia corpus
%default SUB_NODES        '/data/results/wikipedia/mini/nodes'    -- all nodes in the subuniverse
%default SUB_ARTICLES_OUT '/data/results/wikipedia/mini/articles' -- where output will be stored

-- Full article table; the schema matches the output format documented in the header.
articles = LOAD '$ARTICLES' AS (page_id:int, title:chararray, namespace:int,
  rev_date:int, rev_time:int, rev_epoch_time:long, rev_dow:int, article_text:chararray);

-- Ids of the nodes that make up the subuniverse.
-- NOTE(review): assumes node_id values are article page ids -- confirm against
-- the script that produces $SUB_NODES.
sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);

-- Inner join: keeps only the articles whose page_id appears in sub_nodes.
-- The join key must be page_id; the articles schema has no field named `id`.
-- NOTE(review): a node_id that occurs more than once in $SUB_NODES will emit its
-- article once per occurrence.
sub_articles_unfiltered = JOIN articles BY page_id, sub_nodes BY node_id;

-- Drop the joined node_id column and strip the `articles::` prefix so the stored
-- relation has the same field names as the input articles table.
sub_articles = FOREACH sub_articles_unfiltered GENERATE
  articles::page_id AS page_id, articles::title AS title, articles::namespace AS namespace,
  articles::rev_date AS rev_date, articles::rev_time AS rev_time,
  articles::rev_epoch_time AS rev_epoch_time, articles::rev_dow AS rev_dow,
  articles::article_text AS article_text;

STORE sub_articles INTO '$SUB_ARTICLES_OUT';

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
wukong-3.0.0.pre2 examples/munging/wikipedia/subuniverse/sub_articles.pig