Repository 'shm_csr'
hg clone https://toolshed.g2.bx.psu.edu/repos/davidvanzessen/shm_csr

Changeset 63:8728284105ee (2017-12-06)
Previous changeset 62:aa8d37bd1930 (2017-12-05) Next changeset 64:c6dd3215ebe0 (2017-12-06)
Commit message:
Uploaded
modified:
baseline/filter.r
baseline/script_imgt.py
baseline/wrapper.sh
shm_csr.py
b
diff -r aa8d37bd1930 -r 8728284105ee baseline/filter.r
--- a/baseline/filter.r Tue Dec 05 10:57:13 2017 -0500
+++ b/baseline/filter.r Wed Dec 06 08:04:52 2017 -0500
[
@@ -9,6 +9,20 @@
 summarydat = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
 gappeddat = read.table(gappedfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
 
+fix_column_names = function(df){
+    if("V.DOMAIN.Functionality" %in% names(df)){
+        names(df)[names(df) == "V.DOMAIN.Functionality"] = "Functionality"
+        print("found V.DOMAIN.Functionality, changed")
+    }
+    if("V.DOMAIN.Functionality.comment" %in% names(df)){
+        names(df)[names(df) == "V.DOMAIN.Functionality.comment"] = "Functionality.comment"
+        print("found V.DOMAIN.Functionality.comment, changed")
+    }
+    return(df)
+}
+
+gappeddat = fix_column_names(gappeddat)
+
 #dat = data.frame(merge(gappeddat, summarydat, by="Sequence.ID", all.x=T))
 
 dat = cbind(gappeddat, summarydat$AA.JUNCTION)
@@ -24,12 +38,18 @@
 dat$JGene = gsub("^Homsap ", "", dat$J.GENE.and.allele)
 dat$JGene = gsub("[*].*", "", dat$JGene)
 
-#print(str(dat))
+print(str(dat))
 
 dat$past = do.call(paste, c(dat[unlist(strsplit(selection, ","))], sep = ":"))
 
 dat = dat[!duplicated(dat$past), ]
 
+print(paste("Sequences remaining after duplicate filter:", nrow(dat)))
+
 dat = dat[dat$Functionality != "No results" & dat$Functionality != "unproductive",]
 
+print(paste("Sequences remaining after functionality filter:", nrow(dat)))
+
+print(paste("Sequences remaining:", nrow(dat)))
+
 write.table(x=dat, file=output, sep="\t",quote=F,row.names=F,col.names=T)
b
diff -r aa8d37bd1930 -r 8728284105ee baseline/script_imgt.py
--- a/baseline/script_imgt.py Tue Dec 05 10:57:13 2017 -0500
+++ b/baseline/script_imgt.py Wed Dec 06 08:04:52 2017 -0500
[
@@ -10,12 +10,18 @@
 
 args = parser.parse_args()
 
+print "script_imgt.py"
+print "input:", args.input
+print "ref:", args.ref
+print "output:", args.output
+print "id:", args.id
+
 refdic = dict()
 with open(args.ref, 'rU') as ref:
  currentSeq = ""
  currentId = ""
  for line in ref:
- if line[0] is ">":
+ if line.startswith(">"):
  if currentSeq is not "" and currentId is not "":
  refdic[currentId[1:]] = currentSeq
  currentId = line.rstrip()
@@ -23,7 +29,8 @@
  else:
  currentSeq += line.rstrip()
  refdic[currentId[1:]] = currentSeq
-
+
+print "Have", str(len(refdic)), "reference sequences"
 
 vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#,
 # r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)",
@@ -37,16 +44,13 @@
 vPattern = re.compile("|".join(vPattern))
 
 def filterGene(s, pattern):
- s1 = s[s.find(" ") + 1:]
- return s1[:s1.find(" ")]
- """
     if type(s) is not str:
         return None
     res = pattern.search(s)
     if res:
         return res.group(0)
     return None
- """
+
 
 
 currentSeq = ""
b
diff -r aa8d37bd1930 -r 8728284105ee baseline/wrapper.sh
--- a/baseline/wrapper.sh Tue Dec 05 10:57:13 2017 -0500
+++ b/baseline/wrapper.sh Wed Dec 06 08:04:52 2017 -0500
[
@@ -41,7 +41,7 @@
 do
  f=$(file $current)
  zipType="Zip archive"
- if [[ "$f" == *"$zipType"* ]] || [[ "$f" == *"XZ compressed data"* ]]
+ if [[ "$f" == *"Zip archive"* ]] || [[ "$f" == *"XZ compressed data"* ]]
  then
  id=${IDs[$count]}
  echo "id=$id"
@@ -55,19 +55,13 @@
  mkdir -p "$PWD/$id/files"
  tar -xJf $current -C "$PWD/$id/files/"
  fi
- summaryfile="$PWD/summary_${id}.txt"
- gappedfile="$PWD/gappednt_${id}.txt"
  filtered="$PWD/filtered_${id}.txt"
- filecount=`ls -l $PWD/$id/ | wc -l`
- if [[ "$filecount" -eq "2" ]]
- then
- cat $PWD/$id/*/1_* > $summaryfile
- cat $PWD/$id/*/2_* > $gappedfile
- else
- cat $PWD/$id/1_* > $summaryfile
- cat $PWD/$id/2_* > $gappedfile
- fi
- Rscript $dir/filter.r $summaryfile $gappedfile "$selection" $filtered 2>&1
+ imgt_1_file="`find $PWD/$id -name '1_*.txt'`"
+ imgt_2_file="`find $PWD/$id -name '2_*.txt'`"
+ echo "1_Summary file: ${imgt_1_file}"
+ echo "2_IMGT-gapped file: ${imgt_2_file}"
+ echo "filter.r for $id"
+ Rscript $dir/filter.r ${imgt_1_file} ${imgt_2_file} "$selection" $filtered 2>&1
 
  final="$PWD/final_${id}.txt"
  cat $filtered | cut -f2,4,7 > $final
@@ -77,12 +71,6 @@
  fi
  count=$((count+1))
 done
-
-if [[ $(wc -l < $fasta) -eq "1" ]]; then
- echo "No sequences in the fasta file, exiting"
- exit 0
-fi
-
 workdir="$PWD"
 cd $dir
 echo "file: ${inputs[0]}"
b
diff -r aa8d37bd1930 -r 8728284105ee shm_csr.py
--- a/shm_csr.py Tue Dec 05 10:57:13 2017 -0500
+++ b/shm_csr.py Wed Dec 06 08:04:52 2017 -0500
[
@@ -104,23 +104,6 @@
  if len(linesplt[fr3Index]) > 5:
  mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x]
 
- try:
- pass
- except Exception as e:
- print "Something went wrong while processing this line:"
- print "line:", linecount
- print "fr1 len:", len(linesplt[fr1Index]), "value:", linesplt[fr1Index]
- print "cdr1 len:", len(linesplt[cdr1Index]), "value:", linesplt[cdr1Index]
- print "fr2 len:", len(linesplt[fr2Index]), "value:", linesplt[fr2Index]
- print "cdr2 len:", len(linesplt[cdr2Index]), "value:", linesplt[cdr2Index]
- print "fr3 len:", len(linesplt[fr3Index]), "value:", linesplt[fr3Index]
- print ID + "_FR1 in mutationdic", ID + "_FR1" in mutationdic
- print ID + "_CDR1 in mutationdic", ID + "_CDR1" in mutationdic
- print ID + "_FR2 in mutationdic", ID + "_FR2" in mutationdic
- print ID + "_CDR2 in mutationdic", ID + "_CDR2" in mutationdic
- print ID + "_FR3 in mutationdic", ID + "_FR3" in mutationdic
- print linesplt
- print e
  mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
  mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
 
@@ -393,6 +376,50 @@
  WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs
  WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs
  TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs
+
+ mutations_in_motifs_file = os.path.join(os.path.dirname(os.path.abspath(infile)), "mutation_in_motifs.txt")
+ if not os.path.exists(mutation_by_id_file):
+ with open(mutations_in_motifs_file, 'w') as out_handle:
+ out_handle.write("{0}\n".format("\t".join([
+ "Sequence.ID",
+ "mutation_position",
+ "region",
+ "from_nt",
+ "to_nt",
+ "mutation_position_AA",
+ "from_AA",
+ "to_AA",
+ "motif",
+ "motif_start_nt",
+ "motif_end_nt",
+ "rest"
+ ])))
+
+ with open(mutations_in_motifs_file, 'a') as out_handle:
+ motif_dic = {"RGYW": RGYW, "WRCY": WRCY, "WA": WA, "TW": TW}
+ for mutation in mutationList:
+ frm, where, to, AAfrm, AAwhere, AAto, junk = mutation
+ for motif in motif_dic.keys():
+
+ for start, end, region in motif_dic[motif]:
+ if start <= int(where) <= end:
+ out_handle.write("{0}\n".format(
+ "\t".join([
+ ID,
+ where,
+ region,
+ frm,
+ to,
+ str(AAwhere),
+ str(AAfrm),
+ str(AAto),
+ motif,
+ str(start),
+ str(end),
+ str(junk)
+ ])
+ ))
+
 
 
  def mean(lst):