# HG changeset patch # User davidvanzessen # Date 1512565492 18000 # Node ID 8728284105eeae4160406cdbb6a2c90a602326b0 # Parent aa8d37bd1930590f6ebe57c1b7a18bd34201c922 Uploaded diff -r aa8d37bd1930 -r 8728284105ee baseline/filter.r --- a/baseline/filter.r Tue Dec 05 10:57:13 2017 -0500 +++ b/baseline/filter.r Wed Dec 06 08:04:52 2017 -0500 @@ -9,6 +9,20 @@ summarydat = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F) gappeddat = read.table(gappedfile, header=T, sep="\t", fill=T, stringsAsFactors=F) +fix_column_names = function(df){ + if("V.DOMAIN.Functionality" %in% names(df)){ + names(df)[names(df) == "V.DOMAIN.Functionality"] = "Functionality" + print("found V.DOMAIN.Functionality, changed") + } + if("V.DOMAIN.Functionality.comment" %in% names(df)){ + names(df)[names(df) == "V.DOMAIN.Functionality.comment"] = "Functionality.comment" + print("found V.DOMAIN.Functionality.comment, changed") + } + return(df) +} + +gappeddat = fix_column_names(gappeddat) + #dat = data.frame(merge(gappeddat, summarydat, by="Sequence.ID", all.x=T)) dat = cbind(gappeddat, summarydat$AA.JUNCTION) @@ -24,12 +38,18 @@ dat$JGene = gsub("^Homsap ", "", dat$J.GENE.and.allele) dat$JGene = gsub("[*].*", "", dat$JGene) -#print(str(dat)) +print(str(dat)) dat$past = do.call(paste, c(dat[unlist(strsplit(selection, ","))], sep = ":")) dat = dat[!duplicated(dat$past), ] +print(paste("Sequences remaining after duplicate filter:", nrow(dat))) + dat = dat[dat$Functionality != "No results" & dat$Functionality != "unproductive",] +print(paste("Sequences remaining after functionality filter:", nrow(dat))) + +print(paste("Sequences remaining:", nrow(dat))) + write.table(x=dat, file=output, sep="\t",quote=F,row.names=F,col.names=T) diff -r aa8d37bd1930 -r 8728284105ee baseline/script_imgt.py --- a/baseline/script_imgt.py Tue Dec 05 10:57:13 2017 -0500 +++ b/baseline/script_imgt.py Wed Dec 06 08:04:52 2017 -0500 @@ -10,12 +10,18 @@ args = parser.parse_args() +print "script_imgt.py" +print "input:", args.input +print "ref:", args.ref +print "output:", args.output +print "id:", args.id + refdic = dict() with open(args.ref, 'rU') as ref: currentSeq = "" currentId = "" for line in ref: - if line[0] is ">": + if line.startswith(">"): if currentSeq is not "" and currentId is not "": refdic[currentId[1:]] = currentSeq currentId = line.rstrip() @@ -23,7 +29,8 @@ else: currentSeq += line.rstrip() refdic[currentId[1:]] = currentSeq - + +print "Have", str(len(refdic)), "reference sequences" vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#, # r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)", @@ -37,16 +44,13 @@ vPattern = re.compile("|".join(vPattern)) def filterGene(s, pattern): - s1 = s[s.find(" ") + 1:] - return s1[:s1.find(" ")] - """ if type(s) is not str: return None res = pattern.search(s) if res: return res.group(0) return None - """ + currentSeq = "" diff -r aa8d37bd1930 -r 8728284105ee baseline/wrapper.sh --- a/baseline/wrapper.sh Tue Dec 05 10:57:13 2017 -0500 +++ b/baseline/wrapper.sh Wed Dec 06 08:04:52 2017 -0500 @@ -41,7 +41,7 @@ do f=$(file $current) zipType="Zip archive" - if [[ "$f" == *"$zipType"* ]] || [[ "$f" == *"XZ compressed data"* ]] + if [[ "$f" == *"Zip archive"* ]] || [[ "$f" == *"XZ compressed data"* ]] then id=${IDs[$count]} echo "id=$id" @@ -55,19 +55,13 @@ mkdir -p "$PWD/$id/files" tar -xJf $current -C "$PWD/$id/files/" fi - summaryfile="$PWD/summary_${id}.txt" - gappedfile="$PWD/gappednt_${id}.txt" filtered="$PWD/filtered_${id}.txt" - filecount=`ls -l $PWD/$id/ | wc -l` - if [[ "$filecount" -eq "2" ]] - then - cat $PWD/$id/*/1_* > $summaryfile - cat $PWD/$id/*/2_* > $gappedfile - else - cat $PWD/$id/1_* > $summaryfile - cat $PWD/$id/2_* > $gappedfile - fi - Rscript $dir/filter.r $summaryfile $gappedfile "$selection" $filtered 2>&1 + imgt_1_file="`find $PWD/$id -name '1_*.txt'`" + imgt_2_file="`find $PWD/$id -name '2_*.txt'`" + echo "1_Summary file: ${imgt_1_file}" + echo "2_IMGT-gapped file: ${imgt_2_file}" + echo "filter.r for $id" + Rscript $dir/filter.r ${imgt_1_file} ${imgt_2_file} "$selection" $filtered 2>&1 final="$PWD/final_${id}.txt" cat $filtered | cut -f2,4,7 > $final @@ -77,12 +71,6 @@ fi count=$((count+1)) done - -if [[ $(wc -l < $fasta) -eq "1" ]]; then - echo "No sequences in the fasta file, exiting" - exit 0 -fi - workdir="$PWD" cd $dir echo "file: ${inputs[0]}" diff -r aa8d37bd1930 -r 8728284105ee shm_csr.py --- a/shm_csr.py Tue Dec 05 10:57:13 2017 -0500 +++ b/shm_csr.py Wed Dec 06 08:04:52 2017 -0500 @@ -104,23 +104,6 @@ if len(linesplt[fr3Index]) > 5: mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] - try: - pass - except Exception as e: - print "Something went wrong while processing this line:" - print "line:", linecount - print "fr1 len:", len(linesplt[fr1Index]), "value:", linesplt[fr1Index] - print "cdr1 len:", len(linesplt[cdr1Index]), "value:", linesplt[cdr1Index] - print "fr2 len:", len(linesplt[fr2Index]), "value:", linesplt[fr2Index] - print "cdr2 len:", len(linesplt[cdr2Index]), "value:", linesplt[cdr2Index] - print "fr3 len:", len(linesplt[fr3Index]), "value:", linesplt[fr3Index] - print ID + "_FR1 in mutationdic", ID + "_FR1" in mutationdic - print ID + "_CDR1 in mutationdic", ID + "_CDR1" in mutationdic - print ID + "_FR2 in mutationdic", ID + "_FR2" in mutationdic - print ID + "_CDR2 in mutationdic", ID + "_CDR2" in mutationdic - print ID + "_FR3 in mutationdic", ID + "_FR3" in mutationdic - print linesplt - print e mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] @@ -393,6 +376,50 @@ WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs + + mutations_in_motifs_file = os.path.join(os.path.dirname(os.path.abspath(infile)), "mutation_in_motifs.txt") + if not os.path.exists(mutation_by_id_file): + with open(mutations_in_motifs_file, 'w') as out_handle: + out_handle.write("{0}\n".format("\t".join([ + "Sequence.ID", + "mutation_position", + "region", + "from_nt", + "to_nt", + "mutation_position_AA", + "from_AA", + "to_AA", + "motif", + "motif_start_nt", + "motif_end_nt", + "rest" + ]))) + + with open(mutations_in_motifs_file, 'a') as out_handle: + motif_dic = {"RGYW": RGYW, "WRCY": WRCY, "WA": WA, "TW": TW} + for mutation in mutationList: + frm, where, to, AAfrm, AAwhere, AAto, junk = mutation + for motif in motif_dic.keys(): + + for start, end, region in motif_dic[motif]: + if start <= int(where) <= end: + out_handle.write("{0}\n".format( + "\t".join([ + ID, + where, + region, + frm, + to, + str(AAwhere), + str(AAfrm), + str(AAto), + motif, + str(start), + str(end), + str(junk) + ]) + )) + def mean(lst):