-- path to wikipedia pageviews data
%default PAGEVIEWS   's3n://bigdata.chimpy.us/data/results/wikipedia/full/pageviews/2008/03'
-- the target elasticsearch index and mapping ("type"). It will be created on
-- the fly if missing, but you should create it yourself first, as shown below.
%default INDEX       'pageviews'
%default OBJ         'pagehour'
-- path to elasticsearch jars
%default ES_JAR_DIR  '/usr/local/share/elasticsearch/lib'
-- batch size for bulk loading
%default BATCHSIZE   '10000'

-- Example of bulk loading. This will easily load more than a billion documents
-- into a large cluster. We recommend using Ironfan to set your cluster up.
--
-- Preparation:
--
-- Create the index:
--
--    curl -XPUT 'http://projectes-elasticsearch-0.test.chimpy.us:9200/pageviews' -d '{
--      "settings": { "index": {
--        "number_of_shards": 12, "number_of_replicas": 0,
--        "store.compress": { "stored": true, "tv": true } } }}'
--
-- Define the elasticsearch mapping (type):
--
--    curl -XPUT 'http://projectes-elasticsearch-0.test.chimpy.us:9200/pageviews/pagehour/_mapping' -d '{
--      "pagehour": {
--        "_source": { "enabled": true },
--        "properties": {
--          "page_id":      { "type": "long",    "store": "yes" },
--          "namespace":    { "type": "integer", "store": "yes" },
--          "title":        { "type": "string",  "store": "yes" },
--          "num_visitors": { "type": "long",    "store": "yes" },
--          "date":         { "type": "integer", "store": "yes" },
--          "time":         { "type": "long",    "store": "yes" },
--          "ts":           { "type": "date",    "store": "yes" },
--          "day_of_week":  { "type": "integer", "store": "yes" } } }}'
--
-- For best results, see the 'Tips for Bulk Loading' section of the README.

-- Always disable speculative execution when loading into a database
set mapred.map.tasks.speculative.execution false
-- Don't re-use JVMs: logging gets angry
set mapred.job.reuse.jvm.num.tasks 1
-- Use large splits: the setup/teardown cost of many small tasks outweighs the
-- cost of non-local map tasks. (Both split-size properties take bytes, so
-- '3000MB' would fail to parse; the values below are 3000 MB and 2000 MB.)
set mapred.min.split.size    3145728000
set pig.maxCombinedSplitSize 2097152000
set pig.splitCombination     true

register ./target/wonderdog*.jar;
register $ES_JAR_DIR/*.jar;

pageviews = LOAD '$PAGEVIEWS' AS (
  page_id:long, namespace:int, title:chararray,
  num_visitors:long, date:int, time:long,
  epoch_time:long, day_of_week:int);

-- Elasticsearch 'date' fields expect epoch milliseconds, so scale the
-- seconds-resolution epoch_time by 1000.
pageviews_fixed = FOREACH pageviews GENERATE
  page_id, namespace, title, num_visitors, date, time,
  epoch_time * 1000L AS ts, day_of_week;

STORE pageviews_fixed INTO 'es://$INDEX/$OBJ?json=false&size=$BATCHSIZE'
  USING com.infochimps.elasticsearch.pig.ElasticSearchStorage();

--
-- To instead dump the JSON data to disk (needs Pig 0.10+):
--
--   set dfs.replication 2
--   %default OUTDUMP '$PAGEVIEWS.json'
--   rmf $OUTDUMP
--   STORE pageviews_fixed INTO '$OUTDUMP' USING JsonStorage();
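
--
-- A sketch of how you might invoke this script. The script filename and the
-- parameter values here are assumptions -- substitute your own paths:
--
--    pig -p PAGEVIEWS='s3n://your-bucket/pageviews/2008/03' \
--        -p INDEX='pageviews' -p OBJ='pagehour' \
--        bulkload_pageviews.pig
--
-- (Pig's -p flag overrides the %default values declared at the top.)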
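--
-- After the job finishes, you can sanity-check the load with Elasticsearch's
-- count API (the host shown is the example cluster above; use your own):
--
--    curl 'http://projectes-elasticsearch-0.test.chimpy.us:9200/pageviews/pagehour/_count'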