Sha256: 1d19c2b10828c8c52807a87bc6aea100422735847923c70b8574769cccd4a005

Contents?: true

Size: 1.38 KB

Versions: 10

Compression:

Stored size: 1.38 KB

Contents

# Configures the AWS Elastic MapReduce (EMR) jobs launched against the
# Common Crawl corpus.

# Name of the S3 bucket that is created by the init command. The bucket is
# used to store data and logs.
# NOTE(review): 'BUCKET_NAME' looks like a template placeholder - confirm it
# is replaced with a real bucket name before jobs are launched.
s3_bucket_name: 'BUCKET_NAME'

# A parse step is created per Common Crawl segment.  A combine step takes the
# results from multiple segments to create a single set of output files.

# The parse input filter is used to specify the Common Crawl file type.

# WARC: 'warc/*.warc.gz' - Full HTTP requests and responses.
# WAT: 'wat/*.warc.wat.gz' - Metadata extractions from WARC files.
# WET: 'wet/*.warc.wet.gz' - Text extractions from WARC files.

# The EMR config is an XML file that sets Hadoop properties.  If a config file
# is specified then a bootstrap action is run on each node to apply it.
steps:
  # Parse step for the example Elasticrawl JAR. This does a word count
  # against the text extractions (WET files) of the corpus.
  parse:
    jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
    class: 'com.rossfairbanks.elasticrawl.examples.WordCount'
    input_filter: 'wet/*.warc.wet.gz'
    # No EMR config file is specified for this step. To specify one, replace
    # null with the S3 path of the XML file, e.g.
    # 's3://elasticrawl/jar/parse-mapred-site.xml'
    emr_config: null
  # Combine step for the example Elasticrawl JAR.
  combine:
    jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
    class: 'com.rossfairbanks.elasticrawl.examples.SegmentCombiner'
    input_filter: 'part-*'
    # No EMR config file is specified for this step. To specify one, replace
    # null with the S3 path of the XML file, e.g.
    # 's3://elasticrawl/jar/combine-mapred-site.xml'
    emr_config: null

Version data entries

10 entries across 10 versions & 1 rubygems

Version Path
elasticrawl-1.1.8 templates/jobs.yml
elasticrawl-1.1.7 templates/jobs.yml
elasticrawl-1.1.6 templates/jobs.yml
elasticrawl-1.1.5 templates/jobs.yml
elasticrawl-1.1.4 templates/jobs.yml
elasticrawl-1.1.3 templates/jobs.yml
elasticrawl-1.1.2 templates/jobs.yml
elasticrawl-1.1.1 templates/jobs.yml
elasticrawl-1.1.0 templates/jobs.yml
elasticrawl-1.0.0 templates/jobs.yml