Sha256: cc6278811731df98f55ef08cb4f223d4fed03ebe85bf7457554b79a7b9b4c4f8

Contents?: true

Size: 1.34 KB

Versions: 1

Compression:

Stored size: 1.34 KB

Contents

/*
 * This script filters the pagelinks table, keeping only the pagelinks
 * that both start and end within the supplied subuniverse.
 * 
 * Output format (same as augment_pagelinks):
 * from_id:int, into_id:int, from_namespace:int, from_title:chararray,  into_namespace:int, into_title:chararray
 */

-- Script parameters; each default can be overridden on the pig command line.
-- Input: all edges in the pagelink graph.
%default PAGELINKS '/data/results/wikipedia/full/undirected_pagelinks'
-- Input: all nodes in the subuniverse.
%default SUB_NODES '/data/results/wikipedia/mini/nodes'
-- Output: where the filtered pagelinks will be stored.
%default SUB_PAGELINKS_OUT '/data/results/wikipedia/mini/pagelinks'

-- Inputs: the full edge list and the ids of the nodes that form the subuniverse.
pagelinks_all = LOAD '$PAGELINKS' AS (
  from_id:int,
  into_id:int,
  from_namespace:int,
  from_title:chararray,
  into_namespace:int,
  into_title:chararray
);
universe_nodes = LOAD '$SUB_NODES' AS (node_id:int);

-- Pass 1 (inner join): keep edges whose source page is in the subuniverse.
links_from_sub = JOIN pagelinks_all BY from_id, universe_nodes BY node_id;

-- Pass 2 (inner join): of those, keep edges whose target page is also in the subuniverse.
links_within_sub = JOIN links_from_sub BY into_id, universe_nodes BY node_id;

-- Each join appends a node_id column; project back to the six pagelink columns
-- so the stored rows have the same layout as the input edge list.
links_out = FOREACH links_within_sub GENERATE
  links_from_sub::pagelinks_all::from_id,
  links_from_sub::pagelinks_all::into_id,
  links_from_sub::pagelinks_all::from_namespace,
  links_from_sub::pagelinks_all::from_title,
  links_from_sub::pagelinks_all::into_namespace,
  links_from_sub::pagelinks_all::into_title;

STORE links_out INTO '$SUB_PAGELINKS_OUT';

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
wukong-3.0.0.pre2 examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig