Mercurial > repos > chemteam > mdanalysis_cosine_analysis
changeset 4:a842da7ef42b draft
"planemo upload for repository https://github.com/galaxycomputationalchemistry/galaxy-tools-compchem/ commit 1b23e024af45cc0999d9142d07de6897d4189ec2"
author | chemteam |
---|---|
date | Mon, 24 Aug 2020 16:37:11 -0400 |
parents | 9a3a1f698fc6 |
children | d60c274980f7 |
files | end-to-end.py extract_rmsd.py macros.xml pca_cosine.xml ramachandran_auto_protein.py ramachandran_auto_protein_html.j2 ramachandran_plots.py test-data/test.yml |
diffstat | 8 files changed, 498 insertions(+), 63 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/end-to-end.py Mon Aug 24 16:37:11 2020 -0400 @@ -0,0 +1,96 @@ +#!/usr/bin/env python + +import argparse +import itertools +import sys + +import MDAnalysis as mda + +import matplotlib +import matplotlib.pyplot as plt + +import numpy as np +import numpy.linalg + +matplotlib.use('Agg') # noqa + + +def parse_command_line(argv): + parser = argparse.ArgumentParser() + parser.add_argument('--itraj', help='input traj') + parser.add_argument('--istr', help='input str') + parser.add_argument('--itrajext', help='input traj ext') + parser.add_argument('--istrext', help='input str ext') + parser.add_argument('--isegid1', help='segid 1') + parser.add_argument('--ilabel', help='plot label') + parser.add_argument('--ititle1', help='plot title') + parser.add_argument('--output1', help='output1 - timeseries') + parser.add_argument('--o_plot', help='End to End plot') + return parser.parse_args() + + +args = parse_command_line(sys.argv) + + +u = mda.Universe(args.istr, args.itraj, + topology_format=args.istrext, format=args.itrajext) + +ntermatoms = "(segid %s and name N)" % \ + (args.isegid1) +ctermatoms = "(segid %s and name C)" % \ + (args.isegid1) +# not sure how robust this selection really is +nterm = u.select_atoms(ntermatoms)[0] # first atom named N +cterm = u.select_atoms(ctermatoms)[-1] # takes the last atom named 'C' + +enddist = [] + +for ts in u.trajectory: # iterate through all frames + r = cterm.position - nterm.position # e-to-e vector from atom positions + d = numpy.linalg.norm(r) # end-to-end distance + enddist.append((ts.frame, d)) + +enddist = np.array(enddist) + + +color = itertools.cycle(['r', 'b', 'gold']) + +fig, axs = plt.subplots(1, 2, sharex=False, sharey=False, tight_layout=True) + +params = { + 'axes.labelsize': 8, + 'legend.fontsize': 10, + 'xtick.labelsize': 10, + 'ytick.labelsize': 10, + 'text.usetex': False, + 'figure.figsize': [4.5, 4.5], + 'figure.dpi': 300 +} +plt.rcParams.update(params) + +axs[0].plot(enddist[:, 0], enddist[:, 1], 'r-', lw=2, label=args.ilabel) +axs[0].set_xlabel("number of frames") +axs[0].set_ylabel(r"End to end distance ($\AA$)") +axs[0].legend() + +n, bins, patches = axs[1].hist(enddist[:, 1], color=next( + color), label=args.ilabel, alpha=0.5, density=True, stacked=True) + +axs[1].legend() +axs[1].set_ylabel('Density Normalised Frequency') +axs[1].set_xlabel(r'End to end distance ($\AA$)') +fig.suptitle(args.ititle1, fontsize=12, fontweight='bold') +fig.subplots_adjust(top=0.45) + +print( + " \n".join( + [ + 'The End to End distance is measured between the following atoms:', + str(nterm), + str(cterm)])) + +# svg is better but sticking with png for now +plt.savefig(args.o_plot, format='png') + + +np.savetxt(args.output1, enddist, delimiter='\t')
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/extract_rmsd.py Mon Aug 24 16:37:11 2020 -0400 @@ -0,0 +1,129 @@ +import argparse +import json + +import MDAnalysis as m +from MDAnalysis.analysis import align, rms +from MDAnalysis.analysis.base import AnalysisFromFunction +from MDAnalysis.coordinates.memory import MemoryReader + +import numpy as np + + +def calc_rmsd(str_files, traj_files, ref_str, str_format, traj_format, + ref_str_format, filepath_out, group, start, end, step, + fitting_atoms): + """ + the function will cycle through range 0 to no_t and load all files found. + + str_files: text file with filepaths for structures, one on each line + traj_files: text file with filepaths for trajectories, one on each line + ref_str: reference structure for fitting + filepath_in: directory where the files are located + filepath_out: pickle file where results (3D matrix) should be saved to + + group: atoms for which RMSD should be calculated; + use the MDAnalysis selection language + fitting_atoms: atoms used for str alignment prior to RMSD calculation; + use the MDAnalysis selection language + + start: first trajectory frame to calculate RMSD + end: last trajectory frame to calculate RMSD + step: how frequently frames are sampled between start and end; obviously, + the larger the step, the quicker the script finishes + """ + + # open list of files + with open(str_files) as f1, open(traj_files) as f2: + str_file_list = f1.read().strip().split('\n') + traj_file_list = f2.read().strip().split('\n') + + if sum(1 for line in f1) != sum(1 for line in f2): + raise IOError('Number of structure and trajectory files unequal.') + + no_t = len(traj_file_list) + + data = np.zeros((no_t, no_t, + int((end - start)/step + ((end - start) % step > 0)))) + + # load files + universes = {} + + for traj in range(no_t): + mobile = m.Universe(str_file_list[traj], traj_file_list[traj], + format=traj_format, topology_format=str_format) + ref = m.Universe(ref_str, topology_format=ref_str_format) + + mobile.trajectory[-1] # set mobile trajectory to last frame + ref.trajectory[0] # set reference trajectory to first frame + + # perform alignment + align.AlignTraj(mobile, ref, select=fitting_atoms, + in_memory=True).run() + + grp = mobile.select_atoms(group) + universes[traj] = m.core.universe.Merge(grp) # create Universe w grp + coordinates = AnalysisFromFunction(lambda ag: ag.positions.copy(), + grp).run().results # write to uv + universes[traj].load_new(coordinates, format=MemoryReader) + + print("All trajs loaded by MDAnalysis") + + # calculate differences + for traj1 in range(no_t): + print("Calculating differences for traj {}".format(traj1)) + for traj2 in range(traj1): + + u1 = universes[traj1] + u2 = universes[traj2] + + l1 = u1.select_atoms(group) + l2 = u2.select_atoms(group) + + rmsd = rms.RMSD(l1, l2) + + rmsd.run() + + data[traj1, traj2] = rmsd.rmsd[:, 2] + data[traj2, traj1] = rmsd.rmsd[:, 2] + + with open(filepath_out, 'w') as f: + json.dump(data.tolist(), f, indent=4, sort_keys=True) + + print("Done!") + return + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--trajs', required=True, + help='File containing trajectory filepaths.') + parser.add_argument("--strs", + help='File containing structure filepaths.') + parser.add_argument("--ref-str", + help='File containing reference structure.') + parser.add_argument('--traj-format', required=True, + help='Trajectory format.') + parser.add_argument("--str-format", help='Structure format.') + parser.add_argument("--ref-str-format", + help='Reference structure format.') + parser.add_argument('-o', '--outfile', + help="Path to the output JSON file") + parser.add_argument('--group', help="Atoms for which RMSD should be" + "calculated in MDAnalysis selection language") + parser.add_argument('--fitting', help="Fitting atoms for alignment" + "prior to RMSD calculation") + parser.add_argument('--start', type=int, + help="First trajectory frame to calculate RMSD") + parser.add_argument('--end', type=int, + help="Last trajectory frame to calculate RMSD") + parser.add_argument('--step', type=int, + help="Frame sampling frequency for RMSD calculation") + args = parser.parse_args() + + calc_rmsd(args.strs, args.trajs, args.ref_str, args.str_format, + args.traj_format, args.ref_str_format, args.outfile, + args.group, args.start, args.end, args.step, args.fitting) + + +if __name__ == "__main__": + main()
--- a/macros.xml Wed May 20 13:03:58 2020 -0400 +++ b/macros.xml Mon Aug 24 16:37:11 2020 -0400 @@ -1,36 +1,36 @@ <macros> - <token name="@VERSION@">0.20</token> + <token name="@TOOL_VERSION@">1.0.0</token> <xml name="requirements"> <requirements> - <requirement type="package" version="0.20.1">mdanalysis</requirement> + <requirement type="package" version="@TOOL_VERSION@">mdanalysis</requirement> <yield/> </requirements> </xml> <xml name="analysis_inputs"> - <param format="dcd,xtc" name="trajin" type="data" label="DCD/XTC trajectory input"/> - <param format="pdb,gro" name="strin" type="data" label="PDB/GRO input"/> + <param format="dcd,xtc" name="trajin" type="data" label="DCD/XTC trajectory input" /> + <param format="pdb,gro" name="strin" type="data" label="PDB/GRO input" /> <yield/> </xml> <xml name="sanitizer"> <sanitizer invalid_char=""> - <valid initial="string.ascii_letters,string.digits"/> - </sanitizer> - <yield/> + <valid initial="string.ascii_letters,string.digits" /> + </sanitizer> + <yield/> </xml> <xml name="sanitizer_resids"> <sanitizer invalid_char=""> - <valid initial="string.digits"/> + <valid initial="string.digits" /> </sanitizer> <yield/> </xml> <xml name="tests_inputs"> - <param name="trajin" value="test.dcd" ftype="dcd"/> - <param name="strin" value="test.pdb" ftype="pdb"/> + <param name="trajin" value="test.dcd" ftype="dcd" /> + <param name="strin" value="test.pdb" ftype="pdb" /> <yield/> </xml> <xml name="tests_inputs_gmx"> - <param name="trajin" value="test.xtc" ftype="xtc"/> - <param name="strin" value="test.gro" ftype="gro"/> + <param name="trajin" value="test.xtc" ftype="xtc" /> + <param name="strin" value="test.gro" ftype="gro" /> <yield/> </xml> <xml name="citations">
--- a/pca_cosine.xml Wed May 20 13:03:58 2020 -0400 +++ b/pca_cosine.xml Mon Aug 24 16:37:11 2020 -0400 @@ -1,7 +1,8 @@ -<tool id="mdanalysis_cosine_analysis" name="Cosine Content" version="@VERSION@"> +<tool id="mdanalysis_cosine_analysis" name="Cosine Content" version="@TOOL_VERSION@+galaxy@GALAXY_VERSION@"> <description>- measure the cosine content of the PCA projection</description> <macros> <import>macros.xml</import> + <token name="@GALAXY_VERSION@">0</token> </macros> <expand macro="requirements" /> <command detect_errors="exit_code">
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ramachandran_auto_protein.py Mon Aug 24 16:37:11 2020 -0400 @@ -0,0 +1,163 @@ +#!/usr/bin/env python + +import argparse +import base64 +import importlib +import sys + +import MDAnalysis as mda +from MDAnalysis.analysis.dihedrals import Ramachandran + +import h5py + +from jinja2 import Environment, FileSystemLoader + +import matplotlib +import matplotlib.pyplot as plt +import matplotlib.ticker as ticker + +import numpy as np +import numpy.linalg + +import seaborn as sns + + +matplotlib.use('Agg') # noqa + + +def parse_command_line(argv): + parser = argparse.ArgumentParser() + parser.add_argument('--itraj', help='input traj') + parser.add_argument('--istr', help='input str') + parser.add_argument('--itrajext', help='input traj ext') + parser.add_argument('--istrext', help='input str ext') + parser.add_argument('--isegid1', help='segid 1') + parser.add_argument('--iresid1', help='resid start') + parser.add_argument('--iresid2', help='resid end') + parser.add_argument('--iresname', help='resname e.g. ALA') + parser.add_argument('--igroupby', help='groupby names or ids') + parser.add_argument('--itemplatepath', help='template path') + parser.add_argument('--o_plot1', help='MDA Ramachandran plot') + parser.add_argument('--o_plot2', help='Seaborn Ramachandran plot') + parser.add_argument('--o_data1', help='Timeseries in HDF5 format') + parser.add_argument('--o_html1', help='Html overview output of all plots') + return parser.parse_args() + + +args = parse_command_line(sys.argv) + +currentpath = "." +if args.itemplatepath is not None: + currentpath = args.itemplatepath + + +u = mda.Universe(args.istr, args.itraj, + topology_format=args.istrext, format=args.itrajext) +selection = "(segid %s)" % \ + (args.isegid1) + +if args.iresname is not None: + selection = "(segid %s and resname %s)" % \ + (args.isegid1, args.iresname) + +if args.iresid1 is not None and args.iresid2 is not None: + assert(int(args.iresid1) > 0), "ResID numbering starts at 1 for this tool." + assert(int(args.iresid2) > 0), "ResID numbering starts at 1 for this tool." + assert(int(args.iresid2) > int(args.iresid1) + ), "ResID2 must be at least ResID1+1" + selection = "(segid %s and resid %s-%s)" % \ + (args.isegid1, int(args.iresid1), int(args.iresid2)) + if args.iresname is not None: + selection = "(segid %s and resid %s-%s and resname %s)" % \ + (args.isegid1, int(args.iresid1), int(args.iresid2), args.iresname) + +r = u.select_atoms(selection) + +assert(r != u.select_atoms('name thiscannotpossiblyexist') + ), \ + """The selection you specified returns an empty result. + Check segment names and residue ID's. Also check the + structure and trajectory file selected are the correct ones""" + +if args.igroupby is not None: + group_selections = {} # dictionary of selections + if args.igroupby == 'name': + groupby = sorted(list(set(r.resnames))) + for e in groupby: + s = r & u.select_atoms("resname %s" % e) + this_sel = "%s and resname %s" % (selection, e) + group_selections[this_sel] = s + elif args.igroupby == 'id': + groupby = sorted(list(set(r.resids))) + for e in groupby: + s = r & u.select_atoms("resid %s" % e) + this_sel = "%s and resid %s" % (selection, e) + group_selections[this_sel] = s + else: + assert False, ("Invalid argument for igroupby. " + "Only name and id are valid options.") + + +def ramachandran_plot(atomgroup, selection, outputfile1, outputfile2, + image_format='png'): + # plot standard mdanalysis and seaborn 2D with kde + R = Ramachandran(atomgroup).run() + fig, ax = plt.subplots(figsize=plt.figaspect(1)) + R.plot(ax=ax, color='k', marker='.', ref=True) + + a = R.angles.reshape(np.prod(R.angles.shape[:2]), 2) + # open hdf file + with h5py.File(args.o_data1, 'a') as f: + setname = "%s" % (selection) + f["/" + setname + "/ramachandran/phi"] = a[:, 0] + f["/" + setname + "/ramachandran/psi"] = a[:, 1] + plt.tight_layout() + # svg is better but sticking with png for now + plt.savefig(outputfile1, format=image_format) + + sns.reset_defaults() + importlib.reload(plt) + importlib.reload(sns) + with sns.axes_style("white"): + h = sns.jointplot(x=a[:, 0], y=a[:, 1], + kind="kde", space=0) + h.set_axis_labels(r'$\phi$ (deg)', r'$\psi$ (deg)') + h.ax_joint.set_xlim(-180, 180) + h.ax_joint.set_ylim(-180, 180) + h.ax_joint.xaxis.set_major_locator(ticker.MultipleLocator(60)) + h.ax_joint.yaxis.set_major_locator(ticker.MultipleLocator(60)) + plt.savefig(outputfile2, format=image_format, bbox_inches='tight') + + +def get_base64_encoded_image(image_path): + """ encode image to string for use in html later""" + with open(image_path, "rb") as img_file: + return base64.b64encode(img_file.read()).decode('utf-8') + + +plots = [] +if args.igroupby is not None: + for k, v in group_selections.items(): + print(k, v) + try: + ramachandran_plot(v, str(k), "ramachandran1" + + str(k), "ramachandran2" + str(k)) + plots.append({'Name': "%s" % (k), 'plot1': + get_base64_encoded_image("ramachandran1" + str(k)), + 'plot2': get_base64_encoded_image("ramachandran2" + + str(k))}) + except Exception as einstance: + print(type(einstance)) + print(einstance.args) + print(einstance) + +ramachandran_plot(r, selection, args.o_plot1, args.o_plot2) +plots.insert(0, {'Name': selection, 'plot1': get_base64_encoded_image( + args.o_plot1), 'plot2': get_base64_encoded_image(args.o_plot2)}) + +template_environment = Environment(loader=FileSystemLoader( + currentpath), lstrip_blocks=True, trim_blocks=True) +template = template_environment.get_template( + 'ramachandran_auto_protein_html.j2') +with open(args.o_html1, 'w+') as f: + f.write(template.render(title="Ramachandran Plots", plots=plots))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ramachandran_auto_protein_html.j2 Mon Aug 24 16:37:11 2020 -0400 @@ -0,0 +1,25 @@ +<html> + +<head> + <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> + <title>{{ title }}</title> +</head> + +<body> + <table> + <tr> + <th>Selection</th> + <th>Ramachandran scatter plot</th> + <th>Ramachandran histogram </th> + </tr> + {% for plot in plots %} + <tr> + <td>{{ plot['Name'] }}</td> + <td style="vertical-align:center"><img src="data:image/png;base64,{{plot['plot1']}}" /> </td> + <td style="vertical-align:center"><img src="data:image/png;base64,{{plot['plot2']}}" /> </td> + </tr> + {% endfor %} + </table> +</body> + +</html>
--- a/ramachandran_plots.py Wed May 20 13:03:58 2020 -0400 +++ b/ramachandran_plots.py Mon Aug 24 16:37:11 2020 -0400 @@ -10,11 +10,16 @@ import matplotlib import matplotlib.pyplot as plt +import matplotlib.ticker as ticker + import numpy as np import seaborn as sns + +import yaml + matplotlib.use('Agg') # noqa @@ -24,30 +29,7 @@ parser.add_argument('--istr', help='input str') parser.add_argument('--itrajext', help='input traj ext') parser.add_argument('--istrext', help='input str ext') - parser.add_argument('--isegid1', help='segid 1') - parser.add_argument('--iresid1', help='resid 1') - parser.add_argument('--iname1', help='name 1') - parser.add_argument('--isegid2', help='segid 2') - parser.add_argument('--iresid2', help='resid 2') - parser.add_argument('--iname2', help='name 2') - parser.add_argument('--isegid3', help='segid 3') - parser.add_argument('--iresid3', help='resid 3') - parser.add_argument('--iname3', help='name 3') - parser.add_argument('--isegid4', help='segid 4') - parser.add_argument('--iresid4', help='resid 4') - parser.add_argument('--iname4', help='name 4') - parser.add_argument('--isegid5', help='segid 1') - parser.add_argument('--iresid5', help='resid 1') - parser.add_argument('--iname5', help='name 1') - parser.add_argument('--isegid6', help='segid 2') - parser.add_argument('--iresid6', help='resid 2') - parser.add_argument('--iname6', help='name 2') - parser.add_argument('--isegid7', help='segid 3') - parser.add_argument('--iresid7', help='resid 3') - parser.add_argument('--iname7', help='name 3') - parser.add_argument('--isegid8', help='segid 4') - parser.add_argument('--iresid8', help='resid 4') - parser.add_argument('--iname8', help='name 4') + parser.add_argument('--iyml', help='input in yml format') parser.add_argument('--output', help='output') parser.add_argument('--oramachandran_plot', help='dihedral plot') return parser.parse_args() @@ -55,35 +37,35 @@ args = parse_command_line(sys.argv) +with open(args.iyml) as file: + params = yaml.load(file, Loader=yaml.FullLoader) + Dihedral = namedtuple( 'Dihedral', ['atom1', 'atom2', 'atom3', 'atom4']) +for k, v in params.items(): + for a in ['phi', 'psi']: + assert (a in v), "Key %s is missing in inputs: %s " % (a, k) + atoms = [] + for b in ['atom1', 'atom2', 'atom3', 'atom4']: + assert (b in v[a]), "Key %s is missing in inputs: %s %s" % ( + b, k, a) + for c in ['segid', 'resid', 'name']: + assert (c in v[a][b]), \ + "Key %s is missing in inputs: %s %s %s " % (c, k, a, b) + atoms.append("(segid %s and resid %s and name %s)" % + (v[a][b]['segid'], v[a][b]['resid'], v[a][b]['name'])) + print(atoms) + if a == 'phi': + dihe_phi = Dihedral(atoms[0], atoms[1], atoms[2], atoms[3]) + if a == 'psi': + dihe_psi = Dihedral(atoms[0], atoms[1], atoms[2], atoms[3]) + # order of dihedral atom is the crystallographic definition # (see glycanstructure.org) -# phi -atom1 = "(segid %s and resid %s and name %s)" % \ - (args.isegid1, args.iresid1, args.iname1) -atom2 = "(segid %s and resid %s and name %s)" % \ - (args.isegid2, args.iresid2, args.iname2) -atom3 = "(segid %s and resid %s and name %s)" % \ - (args.isegid3, args.iresid3, args.iname3) -atom4 = "(segid %s and resid %s and name %s)" % \ - (args.isegid4, args.iresid4, args.iname4) - -dihe_phi = Dihedral(atom1, atom2, atom3, atom4) - -# psi -atom1 = "(segid %s and resid %s and name %s)" % \ - (args.isegid5, args.iresid5, args.iname5) -atom2 = "(segid %s and resid %s and name %s)" % \ - (args.isegid6, args.iresid6, args.iname6) -atom3 = "(segid %s and resid %s and name %s)" % \ - (args.isegid7, args.iresid7, args.iname7) -atom4 = "(segid %s and resid %s and name %s)" % \ - (args.isegid8, args.iresid8, args.iname8) - -dihe_psi = Dihedral(atom1, atom2, atom3, atom4) +assert(dihe_phi), "phi dihedral doesn't exist" +assert(dihe_psi), "psi dihedral doesn't exist" def calc_torsion(dihedral): @@ -120,8 +102,11 @@ writer.writerows(zip(phi_frame, phi_series, psi_series)) with sns.axes_style("white"): - h = sns.jointplot(x=phi_series, y=psi_series, kind="kde", legend=True) - h.set_axis_labels(r'$\Phi$ (degrees)', r'$\Psi$ (degrees)') + h = sns.jointplot(x=phi_series, y=psi_series, + kind="kde", space=0, legend=True) + h.set_axis_labels(r'$\phi$ (degrees)', r'$\psi$ (degrees)') h.ax_joint.set_xlim(-180, 180) h.ax_joint.set_ylim(-180, 180) - plt.savefig(args.oramachandran_plot, format='png') + h.ax_joint.xaxis.set_major_locator(ticker.MultipleLocator(60)) + h.ax_joint.yaxis.set_major_locator(ticker.MultipleLocator(60)) + plt.savefig(args.oramachandran_plot, format='png', bbox_inches='tight')
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test.yml Mon Aug 24 16:37:11 2020 -0400 @@ -0,0 +1,36 @@ +ramachandran1: + phi: + atom1: + segid: HET + resid: 3 + name: O5 + atom2: + segid: HET + resid: 3 + name: C1 + atom3: + segid: HET + resid: 2 + name: O4 + atom4: + segid: HET + resid: 2 + name: C4 + psi: + atom1: + segid: HET + resid: 3 + name: C1 + atom2: + segid: HET + resid: 2 + name: O4 + atom3: + segid: HET + resid: 2 + name: C4 + atom4: + segid: HET + resid: 2 + name: C5 + comment: pick visually using VMD using labels. Go to labels, dihedral to see the information about resname resid and atomname and then lookup the segname for ach atom.