#!/usr/bin/env bash

# Sort two big illumina files corresponding to paired-end experiment and then join common sequences on different files. Sequences not in common goes to a separate file.

# cat $1 | awk '{split($0, a, " "); n++; if (n%1==0){printf("%s\t",a[1]);}; printf("%s",$0); if(n%4==0) { printf("\n");} else { printf("\t");} }'
# 
# exit

if [ "$#" < 4 ]; 
then
    echo ""
	echo "Use: $0 file1.fastq file2.fastq base_output_name tmp_dir"
    echo ""
	exit
fi

base_name=$3

if [[ -z "$base_name" ]]; then
    echo "Use a : $base_name doesn't exists"
    exit -1
fi

tmp_dir=$4

if [[ -z "$4" ]]; then
    tmp_dir=`pwd`
fi

if [[ ! -e "$tmp_dir" ]]; then
    echo "Tmp dir: $4 doesn't exists"
    exit -1
fi

echo "Using TMPDIR $tmp_dir"

f1_path=$1
f2_path=$2

f1_name=`basename $1`
f2_name=`basename $2`

f1_tmp="$tmp_dir/${f1_name}"
f2_tmp="$tmp_dir/${f2_name}"

common_names="$tmp_dir/comm.names"

only_in_1="$tmp_dir/only_in_1.txt"
only_in_2="$tmp_dir/only_in_2.txt"
in_both="$tmp_dir/in_both.txt"


echo "Starting sorting"

if [[ ! -e "$f1_tmp.sorted" ]]; then
    echo "Sorting $f1_name"
    cat $f1_path | awk '{split($0, a, " "); sub(/\/1$/,"\t", a[1]); n++; if (n%4==1){printf("%s",a[1]);}; printf("%s",$0); if(n%4==0) { printf("\n");} else { printf("\t");} }' | sort -T $tmp_dir -k1,1 -t $'\t' > $f1_tmp.sorted &

fi

if [[ ! -e "$f2_tmp.sorted" ]]; then
    echo "Sorting $f2_name"
    cat $f2_path | awk '{split($0, a, " "); sub(/\/2$/,"\t", a[1]); n++; if (n%4==1){printf("%s",a[1]);}; printf("%s",$0); if(n%4==0) { printf("\n");} else { printf("\t");} }' | sort -T $tmp_dir -k1,1 -t $'\t' > $f2_tmp.sorted &
fi
wait

echo "Starting name extraction"
if [[ ! -e "$f1_tmp.names" ]]; then
    echo "Extracting names from $f1_tmp.sorted"
    # cat $1.sorted | cut -f1 | sed 's/\(.*\)\/1$/\1/' > $1.names &
    cat $f1_tmp.sorted | cut -f1  > $f1_tmp.names &
fi
if [[ ! -e "$f2_tmp.names" ]]; then
    echo "Extracting names from $f2_tmp.sorted"
    cat $f2_tmp.sorted | cut -f1  > $f2_tmp.names &
fi
wait

echo "Starting names comparison"
if [[ ! -e "$common_names" ]]; then
    echo "Making comm file"
    # diff $1.names $2.names > names.diff
    comm $f1_tmp.names $f2_tmp.names > $common_names
fi

echo "Starting names extraction"
# grep '^>' names.diff | cut -d ' ' -f2 | awk '{ printf("%s/2\n",$0) }' > only_in_2.txt &
# grep '^<' names.diff | cut -d ' ' -f2 | awk '{ printf("%s/1\n",$0) }' > only_in_1.txt &

grep -P '^[^\t]' $common_names > $only_in_1 &
grep -P '^\t[^\t]' $common_names |tr -d "\t" > $only_in_2 &
grep -P '^\t\t[^\t]' $common_names |tr -d "\t" > $in_both &
wait

echo "Num seqs only in 1) $f1_name"
wc -l $only_in_1

echo "Num seqs only in 2) $f2_name"
wc -l $only_in_2

echo "Num seqs in both $f1_name and $f2_name"
wc -l $in_both

echo "Starting extracting seqs"
join -t $'\t' -1 1 -2 1 $only_in_1 $f1_tmp.sorted |cut -f 2,3,4,5| tr "\t" "\n" > ${base_name}_normal1.fastq &
join -t $'\t' -1 1 -2 1 $only_in_2 $f2_tmp.sorted |cut -f 2,3,4,5| tr "\t" "\n" > ${base_name}_normal2.fastq &

join -t $'\t' -1 1 -2 1 $in_both $f1_tmp.sorted  |cut -f 2,3,4,5| tr "\t" "\n" > ${base_name}_paired1.fastq &
join -t $'\t' -1 1 -2 1 $in_both $f2_tmp.sorted  |cut -f 2,3,4,5| tr "\t" "\n" > ${base_name}_paired2.fastq &
wait

rm $f1_tmp.names
rm $f2_tmp.names

rm $f1_tmp.sorted
rm $f2_tmp.sorted

rm $only_in_2
rm $only_in_1
rm $in_both

rm $common_names