# HG changeset patch
# User davidvanzessen
# Date 1472463988 14400
# Node ID d3cf09f5a1a65779cc3281b6103bfe7c348dcf01
Uploaded

diff -r 000000000000 -r d3cf09f5a1a6 imgt_concatenate.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/imgt_concatenate.sh	Mon Aug 29 05:46:28 2016 -0400
@@ -0,0 +1,111 @@
+#!/bin/bash
+# Concatenate one or more IMGT archives (zip or xz-compressed tar) into a
+# single xz-compressed tar archive.
+#
+# Usage: imgt_concatenate.sh <output> <input> [<input> ...]
+
+dir="$(cd "$(dirname "$0")" && pwd)"
+
+if [ "$#" -lt 2 ]; then
+    echo "Usage: $0 <output> <input> [<input> ...]" >&2
+    exit 1
+fi
+
+args=("$@")
+output=$1
+inputs=("${args[@]:1}")
+
+workdir="$PWD"
+
+echo "Output: $output"
+echo "Inputs: ${inputs[*]}"
+
+# The script changes directory before creating the archive, so a relative
+# output path is anchored to the starting directory first.
+case "$output" in
+    /*) ;;
+    *) output="$workdir/$output" ;;
+esac
+
+mkdir -p "$workdir/output"
+
+# Unpack an IMGT archive into a directory.
+#   $1 - path to the archive (zip or xz-compressed tar, detected with file)
+#   $2 - directory to unpack into (created if missing)
+# Returns non-zero when the archive type is not recognised.
+function imgt_unpack {
+    local imgt_zip=$1
+    local outdir=$2
+    local file_type
+    mkdir -p "$outdir"
+    file_type="$(file -b "$imgt_zip")"
+    if [[ "$file_type" == *"Zip archive"* ]] ; then
+        unzip "$imgt_zip" -d "$outdir"
+    elif [[ "$file_type" == *"XZ compressed data"* ]] ; then
+        echo "tar -xJf $imgt_zip -C $outdir"
+        tar -xJf "$imgt_zip" -C "$outdir"
+    else
+        echo "Unrecognised archive type for $imgt_zip: $file_type" >&2
+        return 1
+    fi
+}
+
+# Append the ten IMGT tables found under a directory to the matching files
+# in an output directory.
+#   $1 - directory searched recursively for files named "<n>_*"
+#   $2 - directory holding the concatenated files
+#   $3 - first line to copy: 1 keeps the header, 2 skips it
+function concat_imgt_files {
+    local indir=$1
+    local outdir=$2
+    local start_line=$3
+    local name
+    local names=(
+        "1_Summary.txt"
+        "2_IMGT-gapped-nt-sequences.txt"
+        "3_Nt-sequences.txt"
+        "4_IMGT-gapped-AA-sequences.txt"
+        "5_AA-sequences.txt"
+        "6_Junction.txt"
+        "7_V-REGION-mutation-and-AA-change-table.txt"
+        "8_V-REGION-nt-mutation-statistics.txt"
+        "9_V-REGION-AA-change-statistics.txt"
+        "10_V-REGION-mutation-hotspots.txt"
+    )
+    for name in "${names[@]}"; do
+        # find -exec copes with spaces in paths and, unlike cat on a
+        # backtick-expanded find, never leaves cat reading stdin when
+        # nothing matches.
+        find "$indir/" -name "${name%%_*}_*" -exec cat {} + \
+            | tail -n "+${start_line}" >> "$outdir/$name"
+    done
+}
+
+echo "Unpacking IMGT file 1.."
+imgt_unpack "${inputs[0]}" "$workdir/input1"
+
+echo "Concatenating IMGT file 1..."
+concat_imgt_files "$workdir/input1" "$workdir/output" 1
+
+remaining_inputs=("${inputs[@]:1}")
+
+i="2"
+for input in "${remaining_inputs[@]}"
+do
+    echo "Unpacking IMGT file $i.."
+    current_dir="$workdir/input${i}"
+    imgt_unpack "${input}" "${current_dir}"
+    # Report the running file index, not the script's first argument.
+    echo "Concatenating IMGT file $i..."
+    # Later inputs start at line 2 to skip their header.
+    concat_imgt_files "${current_dir}" "$workdir/output" 2
+    i=$((i+1))
+done
+
+echo "Creating new IMGT zip"
+cd "$workdir/output" || exit 1
+tar cfJ "$output" *
+
+#awk to fix the sequence numbers repeating?
+
+echo "Done"
diff -r 000000000000 -r d3cf09f5a1a6 imgt_concatenate.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/imgt_concatenate.xml	Mon Aug 29 05:46:28 2016 -0400
@@ -0,0 +1,25 @@
+
+
+
+ imgt_concatenate.sh $out_file
+#for $i, $f in enumerate($files)
+ "$f.file"
+#end for
+
+
+
+
+
+
+
+
+
+
+
+
+
+Concatenate two or more IMGT files to create one new IMGT zip with all the sequences of the inputs.
+
+
+
+