Sha256: d4f9e5776f64b6d2289f5055cf7a4200d82cbe8ad5543b89582cdf8653a23f75
Contents?: true
Size: 1.09 KB
Versions: 6
Compression:
Stored size: 1.09 KB
Contents
#!/bin/sh
#
# Build the chbs corpus files in every supported storage format.
#
# All paths are relative to the current directory: the script runs
# ./bin/chbs with -Ilib, reads from data/ and writes into corpus/.
#
# Steps:
#   1. Build the gutenberg2005, tvscripts and wordfrequency corpora in
#      each format.
#   2. Build the size100 test corpus as JSON (--randomize --limit 100),
#      then convert it to every other format.
#   3. If corpus/coca500k.json is readable, convert it to every other
#      format as well.
#
# A failing chbs invocation does not abort the script; later steps still run.

# Space-separated list of output formats. sqlite is only added when
# `ruby -v` does not mention jruby.
formats="json csv isam isamkd marshal"
if ! ruby -v | grep -q jruby; then
  formats="$formats sqlite"
fi

# Run the chbs CLI from the working tree with --trace.
# Arguments: the chbs subcommand followed by its arguments.
# The arguments are forwarded with "$@" rather than through `eval` on a
# command string, so file names produced by glob expansion are passed to
# ruby verbatim and are never re-parsed as shell code.
chbs() {
  ruby -Ilib ./bin/chbs --trace "$@"
}

# Convert corpus/NAME.json into every non-JSON format listed in $formats.
# Arguments: $1 - corpus base name (no directory, no extension)
convert_from_json() {
  # $formats is deliberately unquoted: it is a space-separated word list.
  for format in $formats; do
    if [ "$format" != "json" ]; then
      chbs convert "corpus/$1.json" "corpus/$1.$format"
    fi
  done
}

for format in $formats; do
  echo "Creating corpus files for format $format"
  chbs mkcorpus -i wiktionary -o "corpus/gutenberg2005.$format" \
    data/wiktionary/gutenberg2005/*
  chbs mkcorpus -i tvscripts -o "corpus/tvscripts.$format" \
    data/wiktionary/tv-and-movies/*
  chbs mkcorpus -i wordfrequency -o "corpus/wordfrequency.$format" \
    data/wordfreq/toplemmas/*
done

echo "Making size100 test corpus in all formats..."
chbs mkcorpus -i tvscripts --randomize --limit 100 \
  -o corpus/size100.json data/wiktionary/tv-and-movies/*
convert_from_json size100

# coca500k.json is optional; it is only converted when it is present
# and readable.
if [ -r "corpus/coca500k.json" ]; then
  echo "Converting coca500k.json corpus to all formats..."
  convert_from_json coca500k
fi
Version data entries
6 entries across 6 versions & 1 rubygems