Sha256: d54e8660cdce4b8d44ebf33747a6ef423993ef14e4d4d0fe6f148532f805e565
Contents?: true
Size: 1.2 KB
Versions: 1
Compression:
Stored size: 1.2 KB
Contents
/*
 * This script filters the articles table, leaving only the articles
 * in the specified subuniverse.
 *
 * An article is kept when its page_id matches a node_id listed in the
 * SUB_NODES input (inner join); the node_id column is then dropped so the
 * output has the same columns as the input articles table.
 *
 * Parameters (override with -param NAME=value):
 *   ARTICLES          input path of the full articles table
 *   SUB_NODES         input path of the node ids that make up the subuniverse
 *   SUB_ARTICLES_OUT  output path for the filtered articles
 *
 * Output format:
 * page_id:int, title:chararray, namespace:int, rev_date:int, rev_time:int,
 * rev_epoch_time:long, rev_dow:int, article_text:chararray
 */

-- all articles in the wikipedia corpus
%default ARTICLES         '/data/results/wikipedia/full/articles'
-- all nodes in the subuniverse
%default SUB_NODES        '/data/results/wikipedia/mini/nodes'
-- where output will be stored
%default SUB_ARTICLES_OUT '/data/results/wikipedia/mini/articles'

articles = LOAD '$ARTICLES' AS (
    page_id:int,
    title:chararray,
    namespace:int,
    rev_date:int,
    rev_time:int,
    rev_epoch_time:long,
    rev_dow:int,
    article_text:chararray
);

sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);

-- Inner join on the article's page_id: only articles whose id appears in
-- sub_nodes survive. (The articles schema has no field named `id`; the key
-- column is `page_id`.)
-- NOTE(review): if SUB_NODES can contain the same node_id more than once,
-- each matching article is emitted once per duplicate -- confirm the input
-- is already distinct.
sub_articles_unfiltered = JOIN articles BY page_id, sub_nodes BY node_id;

-- Keep only the article columns and strip the `articles::` disambiguation
-- prefix that the join adds to field names.
sub_articles = FOREACH sub_articles_unfiltered GENERATE
    articles::page_id        AS page_id,
    articles::title          AS title,
    articles::namespace      AS namespace,
    articles::rev_date       AS rev_date,
    articles::rev_time       AS rev_time,
    articles::rev_epoch_time AS rev_epoch_time,
    articles::rev_dow        AS rev_dow,
    articles::article_text   AS article_text;

STORE sub_articles INTO '$SUB_ARTICLES_OUT';
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
wukong-3.0.0.pre2 | examples/munging/wikipedia/subuniverse/sub_articles.pig |