Sha256: 40499def4e4bc6740094878b1dd57e712c2a84de54e62c47b3b1126b30a2ac06
Contents?: true
Size: 1.41 KB
Versions: 1
Compression:
Stored size: 1.41 KB
Contents
/*
 * Restricts the page metadata table to the pages that belong to the
 * specified subuniverse.
 *
 * Parameters (each may be overridden on the command line):
 *   PAGE_METADATA          metadata for all pages in the wikipedia corpus
 *   SUB_NODES              all nodes in the subuniverse
 *   SUB_PAGE_METADATA_OUT  where the filtered metadata is stored
 *
 * Output format (same fields, in the same order, as page_metadata):
 *   id:int, namespace:int, title:chararray, restrictions:chararray, counter:long,
 *   is_redirect:int, is_new:int, random:float, touched:int, page_latest:int, len:int
 */

-- Metadata for all pages in the wikipedia corpus.
%default PAGE_METADATA '/data/results/wikipedia/full/page_metadata'

-- All nodes in the subuniverse.
%default SUB_NODES '/data/results/wikipedia/mini/nodes'

-- Where the output will be stored.
%default SUB_PAGE_METADATA_OUT '/data/results/wikipedia/mini/page_metadata'

all_pages = LOAD '$PAGE_METADATA' AS (
    id:int,
    namespace:int,
    title:chararray,
    restrictions:chararray,
    counter:long,
    is_redirect:int,
    is_new:int,
    random:float,
    touched:int,
    page_latest:int,
    len:int
);

subuniverse_nodes = LOAD '$SUB_NODES' AS (node_id:int);

-- Inner join: only pages whose id matches a node id survive.
-- NOTE(review): a node id that occurs more than once in SUB_NODES would emit
-- the matching page once per occurrence -- confirm the node list is unique.
pages_in_subuniverse = JOIN all_pages BY id, subuniverse_nodes BY node_id;

-- Drop the joined node_id column so the output keeps the page_metadata layout.
sub_page_metadata = FOREACH pages_in_subuniverse GENERATE
    all_pages::id,
    all_pages::namespace,
    all_pages::title,
    all_pages::restrictions,
    all_pages::counter,
    all_pages::is_redirect,
    all_pages::is_new,
    all_pages::random,
    all_pages::touched,
    all_pages::page_latest,
    all_pages::len;

STORE sub_page_metadata INTO '$SUB_PAGE_METADATA_OUT';
Version data entries
1 entry across 1 version & 1 rubygem
Version | Path |
---|---|
wukong-3.0.0.pre2 | examples/munging/wikipedia/subuniverse/sub_page_metadata.pig |