/*
 * Filters the pageviews table, keeping only the rows whose page belongs to
 * the specified subuniverse.
 *
 * A row is kept when its page_id matches a node_id in the subuniverse node
 * list (inner join), so pages with no matching node are dropped.
 *
 * Parameters (override with -param NAME=value):
 *   PAGEVIEWS         - all pageviews in the wikipedia corpus
 *   SUB_NODES         - the list of nodes in your subuniverse
 *   SUB_PAGEVIEWS_OUT - the directory where output will be stored
 *
 * Output format (same as pageviews_augment.pig):
 *   page_id:int, title:chararray, namespace:int, rev_date:int, rev_time:int,
 *   rev_epoch_time:long, rev_dow:int, article_text:chararray
 */

-- All pageview stats for the English Wikipedia.
%default PAGEVIEWS '/data/results/wikipedia/full/pageviews'
-- All nodes in the subuniverse.
%default SUB_NODES '/data/results/wikipedia/mini/nodes'
-- Where output will be stored.
%default SUB_PAGEVIEWS_OUT '/data/results/wikipedia/mini/pageviews'

-- Both inputs are read with Pig's default load function, since no USING
-- clause is given.
pageviews = LOAD '$PAGEVIEWS' AS (
    page_id:int,
    title:chararray,
    namespace:int,
    rev_date:int,
    rev_time:int,
    rev_epoch_time:long,
    rev_dow:int,
    article_text:chararray
);

sub_nodes = LOAD '$SUB_NODES' AS (node_id:int);

-- Inner join on the page identifier: only pageviews whose page_id appears in
-- sub_nodes survive. The join key must be page_id, since the pageviews schema
-- has no field named "id".
-- NOTE(review): if sub_nodes can contain duplicate node_ids, matching
-- pageviews rows will be emitted once per duplicate -- confirm the node list
-- is unique.
sub_pageviews_unfiltered = JOIN pageviews BY page_id, sub_nodes BY node_id;

-- Drop the joined node_id column and restore the original pageviews schema.
-- Fields are disambiguated with the "pageviews::" prefix, which is the alias
-- of the relation they came from in the join above.
sub_pageviews = FOREACH sub_pageviews_unfiltered GENERATE
    pageviews::page_id        AS page_id,
    pageviews::title          AS title,
    pageviews::namespace      AS namespace,
    pageviews::rev_date       AS rev_date,
    pageviews::rev_time       AS rev_time,
    pageviews::rev_epoch_time AS rev_epoch_time,
    pageviews::rev_dow        AS rev_dow,
    pageviews::article_text   AS article_text;

STORE sub_pageviews INTO '$SUB_PAGEVIEWS_OUT';