Sha256: d9ab49f4afad47ae621d630659f351be7ce2637ee948326b2d88917cf4834e6c

Contents?: true

Size: 912 Bytes

Versions: 1

Compression:

Stored size: 912 Bytes

Contents

#!/bin/usr/env sh
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

set -e

URL=$1

FILENAME=$(basename --suffix=".warc.wet.gz" "${URL}")

echo "Processing ${FILENAME}."

wget -q -P tmp "${URL}"

#echo "Extracting ${FILENAME}.warc.wet.gz"
gunzip "tmp/${FILENAME}.warc.wet.gz"

#echo "Language identification for ${FILENAME}.warc.wet"
fastText/fasttext predict-prob lid.176.bin "tmp/${FILENAME}.warc.wet" > "tmp/${FILENAME}.lid"

#echo "Splitting ${FILENAME}.warc.wet per language"
paste "tmp/${FILENAME}.lid" "tmp/${FILENAME}.warc.wet" | \
    awk '($2 > 0.8 || ($1=="__label__hr" && $2 > 0.4)) && length() > 100 {lang = substr($1, 10); $1=""; $2=""; print $0 >> "shard/"lang".txt"}'

#echo "Removing tmp files"
rm "tmp/${FILENAME}.lid"
rm "tmp/${FILENAME}.warc.wet"

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
fasttext-0.1.0 vendor/fastText/crawl/process_wet_file.sh