annotate imgt_concatenate.sh @ 2:d77d4700fd0a draft

Uploaded
author davidvanzessen
date Tue, 27 Dec 2016 10:11:21 -0500
parents b360a373835f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d3cf09f5a1a6 Uploaded
davidvanzessen
parents:
diff changeset
#!/bin/bash
# Concatenate several IMGT archives into one new archive.
#
# Usage: imgt_concatenate.sh <output> <archive1> <id1> [<archive2> <id2> ...]

# Directory that contains this script.
dir="$(cd "$(dirname "$0")" && pwd)"

# The first argument is the output path; everything after it is an input.
args=("$@")
output=$1
inputs=("${args[@]:1}")

# Intermediate directories are created below the directory the script was
# started from.
workdir="$PWD"

echo "Output: $output"
# [*] joins the array into a single word; [@] inside a longer string would
# split the message into several separate echo arguments.
echo "Inputs: ${inputs[*]}"

# -p: an output directory left over from an earlier run is not an error.
mkdir -p "$workdir/output"
d3cf09f5a1a6 Uploaded
davidvanzessen
parents:
diff changeset
14
d3cf09f5a1a6 Uploaded
davidvanzessen
parents:
diff changeset
# Unpack an IMGT archive into a directory.
#
# Arguments:
#   $1 - path to the archive; Zip and XZ-compressed tar are recognised,
#        based on the description printed by file(1)
#   $2 - directory to extract into; created (with parents) when missing
# Returns:
#   The extractor's exit status, or 1 when the target directory cannot be
#   created or the archive type is not recognised.
function imgt_unpack {
  local imgt_zip=$1
  local outdir=$2
  local archive_type

  mkdir -p "$outdir" || return 1

  # -b leaves the file name out of the report, so a path that happens to
  # contain "Zip archive" cannot influence the detection below.
  archive_type="$(file -b "$imgt_zip")"

  if [[ "$archive_type" == *"Zip archive"* ]]; then
    unzip "$imgt_zip" -d "$outdir"
  elif [[ "$archive_type" == *"XZ compressed data"* ]]; then
    echo "tar -xJf $imgt_zip -C $outdir"
    tar -xJf "$imgt_zip" -C "$outdir"
  else
    # Previously this case fell through silently with status 0.
    echo "imgt_unpack: unsupported archive type for '$imgt_zip': $archive_type" >&2
    return 1
  fi
}
d3cf09f5a1a6 Uploaded
davidvanzessen
parents:
diff changeset
30
d3cf09f5a1a6 Uploaded
davidvanzessen
parents:
diff changeset
# Append the ten IMGT result tables found in one unpacked archive to the
# tables of the same kind in the output directory, tagging rows with an id.
#
# Arguments:
#   $1 - directory holding the unpacked archive (searched recursively for
#        files whose names start with "1_" ... "10_")
#   $2 - directory holding the combined tables; files are appended to
#   $3 - first line to copy from each table: 1 copies the first line
#        unchanged (header), any other value N starts copying at line N
#   $4 - text appended to the second tab-separated column of every copied
#        line (except the untouched first line when $3 is 1)
function concat_imgt_files {
  local indir=$1
  local outdir=$2
  local start_line=$3 # line number to start at, 2 to skip the header
  local id=$4
  local first_tagged=1
  local table prefix
  local -a tables=(
    "1_Summary.txt"
    "2_IMGT-gapped-nt-sequences.txt"
    "3_Nt-sequences.txt"
    "4_IMGT-gapped-AA-sequences.txt"
    "5_AA-sequences.txt"
    "6_Junction.txt"
    "7_V-REGION-mutation-and-AA-change-table.txt"
    "8_V-REGION-nt-mutation-statistics.txt"
    "9_V-REGION-AA-change-statistics.txt"
    "10_V-REGION-mutation-hotspots.txt"
  )

  # When the header line is copied as well it must stay untouched, so
  # tagging starts on the second line of the stream.
  if [[ "$start_line" == "1" ]]; then
    first_tagged=2
  fi

  for table in "${tables[@]}"; do
    prefix="${table%%_*}"
    # find runs cat itself, so paths with whitespace survive, and when
    # nothing matches no cat is started (a bare cat would read stdin).
    find "$indir/" -name "${prefix}_*" ! -type d -exec cat {} + \
      | tail -n "+${start_line}" \
      | awk -F $'\t' -v id="$id" -v first="$first_tagged" \
        'BEGIN { OFS = FS } NR >= first { $2 = $2 id } { print }' \
        >> "$outdir/$table"
  done
}
d3cf09f5a1a6 Uploaded
davidvanzessen
parents:
diff changeset
61
d3cf09f5a1a6 Uploaded
davidvanzessen
parents:
diff changeset
# The first archive is copied with start line 1, so its header rows end up
# at the top of the combined tables.
echo "Unpacking IMGT file 1.."
imgt_unpack "${inputs[0]}" "$workdir/input1"

echo "Concatenating IMGT file 1..."
id="${inputs[1]}"
concat_imgt_files "$workdir/input1" "$workdir/output" 1 "$id"

# Everything after the first (archive, id) pair is a flat list of further
# (archive, id) pairs.
remaining_inputs=("${inputs[@]:2}")

i=0
while ((i < ${#remaining_inputs[@]})); do
  input="${remaining_inputs[i]}"
  id="${remaining_inputs[i + 1]}"
  # i advances by two per archive and the first archive was handled above.
  file_number=$((i / 2 + 2))

  echo "Unpacking IMGT file ${file_number}.."
  # Directory names keep the original index-based numbering.
  current_dir="$workdir/input${i}"
  imgt_unpack "${input}" "${current_dir}"
  echo "Concatenating IMGT file ${file_number}..."
  # Start at line 2 so the header of this archive is not repeated.
  concat_imgt_files "${current_dir}" "$workdir/output" 2 "$id"
  i=$((i + 2))
done

# Print the start of the combined summary to the log.
head "$workdir/output/1_Summary.txt"

echo "Creating new IMGT zip"
cd "$workdir/output" || {
  echo "Cannot change into $workdir/output" >&2
  exit 1
}
tar cfJ "$output" * || {
  echo "Failed to create $output" >&2
  exit 1
}

# TODO: awk to fix the sequence numbers repeating?

echo "Done"

exit 0
b360a373835f Uploaded
davidvanzessen
parents: 0
diff changeset
97
b360a373835f Uploaded
davidvanzessen
parents: 0
diff changeset
# NOTE(review): this loop sits after the unconditional `exit 0`, so it never
# runs. It treats every element of remaining_inputs as an archive and never
# assigns $id itself, so it uses whatever value $id already has. Presumably
# an earlier version of the pair-wise loop; confirm and remove.
i="1"
for input in "${remaining_inputs[@]}"; do
  echo "Unpacking IMGT file $i.."
  current_dir="$workdir/input${i}"
  imgt_unpack "${input}" "${current_dir}"
  # Report the loop counter; "$1" is the script's output path.
  echo "Concatenating IMGT file $i..."
  concat_imgt_files "${current_dir}" "$workdir/output" 2 "$id"
  i=$((i + 1))
done
b360a373835f Uploaded
davidvanzessen
parents: 0
diff changeset
108