Sha256: a5b13dc7a156b965d4f3bf733b62148112418bb28cc4d85fdc6ba7021cf73199

Contents?: true

Size: 1.67 KB

Versions: 1

Compression:

Stored size: 1.67 KB

Contents

/*
 A script to generate Wikipedia page graph edge list
 Accepts as input 2 tsvs: list of pages and list of links
 Link table should initially be formatted as from_page_id, into_namespace, into_title
 Assumes that the combination of namespace and title uniquely identifies a page
 
 Output Format:
 from_id:int, into_id:int, from_namespace:int, from_title:chararray,  into_namespace:int, into_title:chararray
*/

%default PAGE_METADATA           '/data/results/wikipedia/full/page_metadata' -- page metadata for all Wikipedia pages
%default EXTRACTED_PAGELINKS     '/data/scratch/wikipedia/full/pagelinks'     -- raw extracted pagelinks
%default AUGMENTED_PAGELINKS_OUT '/data/results/wikipedia/full/pagelinks'     -- augmented pagelinks

page_metadata = LOAD '$PAGE_METADATA' AS (id:int, namespace:int, title:chararray, 
  restrictions:chararray, counter:long, is_redirect:int, is_new:int, 
  random:float, touched:int, page_latest:int, len:int);
links = LOAD '$EXTRACTED_PAGELINKS' AS (from_id:int, into_namespace:int, into_title:chararray);

first_pass_j = JOIN page_metadata BY id RIGHT OUTER, links BY from_id;
first_pass   = FOREACH first_pass_j GENERATE 
  links::from_id AS from_id, page_metadata::namespace AS from_namespace, page_metadata::title AS from_title,
  links::into_namespace AS into_namespace, links::into_title AS into_title;
second_pass_j = JOIN page_metadata BY (namespace, title) RIGHT OUTER, first_pass BY (into_namespace, into_title);
second_pass   = FOREACH second_pass_j GENERATE 
  first_pass::from_id, page_metadata::id,
  first_pass::from_namespace, first_pass::from_title, 
  first_pass::into_namespace, first_pass::into_title;
STORE second_pass INTO '$AUGMENTED_PAGELINKS_OUT';

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
wukong-3.0.0.pre2 examples/munging/wikipedia/pagelinks/augment_pagelinks.pig