Mercurial > repos > bgruening > xchem_transfs_scoring

--- a/transfs.py	Wed Apr 08 07:28:49 2020 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,322 +0,0 @@
-# Create dir containing ligands.sdf and protein.pdb
-# Enter docker container like this:
-#   docker run -it --rm --gpus all -v $PWD:/root/train/fragalysis_test_files/work:Z informaticsmatters/deep-app-ubuntu-1604:latest bash
-#
-# Now inside the container run like this:
-#   mkdir /tmp/work
-#   rm -rf /tmp/work/* && python3 work/transfs.py -i work/test-data/ligands.sdf -r work/test-data/receptor.pdb -d 2 -w /tmp/work
-#
-# If testing with no GPU you can use the --mock option to generate random scores
-#
-# Start container for testing like this:
-#    docker run -it --rm -v $PWD:$PWD:Z -w $PWD informaticsmatters/deep-app-ubuntu-1604:latest bash
-# Inside container test like this:
-#   mkdir /tmp/work
-#   cd chemicaltoolbox/xchem-deep
-#   rm -rf /tmp/work/* && python3 transfs.py -i test-data/ligands.sdf -r test-data/receptor.pdb -d 2 -w /tmp/work --mock
-#
-
-import argparse, os, sys, math
-import subprocess
-import random
-from openbabel import pybel
-
-types_file_name = 'inputs.types'
-types_file_name = 'inputs.types'
-predict_file_name = 'predictions.txt'
-work_dir = '.'
-paths = None
-inputs_protein = []
-inputs_ligands = []
-
-
-def log(*args, **kwargs):
-    """Log output to STDERR
-    """
-    print(*args, file=sys.stderr, ** kwargs)
-
-def write_raw_inputs(receptor_pdb, ligands_sdf, distance):
-    """
-    Analyses the PDB file for waters that clash with each ligand in the SDF and writes out:
-    1. a PDB file named like receptor-123-543.pdb where the numeric parts are the waters that have been omitted
-    2. a corresponding directory named like receptor-123-543
-    3. an SDF named like receptor-123-543/ligands.sdf containing those ligands that correspond to that receptor.
-    :param receptor_pdb: A PDB file without the ligand but with the crystallographic waters
-    :param ligands_sdf: A SDF with the docked poses
-    :param distance: The distance to consider when removing waters. Only heavy atoms in the ligand are considered.
-    :return:
-    """
-
-    global work_dir
-    global inputs_protein
-    global inputs_ligands
-    global paths
-
-
-    log("Writing data to", work_dir)
-    if not os.path.isdir(work_dir):
-        os.mkdir(work_dir)
-
-    receptor_file = os.path.basename(receptor_pdb)
-
-    sdf_writers = {}
-    paths = []
-
-    # read the receptor once as we'll need to process it many times
-    with open(receptor_pdb, 'r') as f:
-        lines = f.readlines()
-
-    count = 0
-    for mol in pybel.readfile("sdf", ligands_sdf):
-        count += 1
-        if count % 50000 == 0:
-            log('Processed', count)
-
-        try:
-            # print("Processing mol", mol.title)
-
-            clone = pybel.Molecule(mol)
-            clone.removeh()
-
-            coords = []
-            for atom in clone.atoms:
-                coords.append(atom.coords)
-
-            watnumcode = ''
-
-            # getting receptor without waters that will clash with ligand
-            new_receptor_pdb = []
-            for line in lines:
-                if line[17:20] == 'HOH':
-                    x, y, z = float(line[30:39]),  float(line[39:46]), float(line[46:55])
-                    distances = []
-                    for i in coords:
-                        distances.append(math.sqrt((x-i[0])**2 + (y-i[1])**2 + (z-i[2])**2))  # calculates distance based on cartesian coordinates
-                    if min(distances) > distance: # if all distances are larger than 2.0A, then molecule makes it to new file
-                        new_receptor_pdb.append(line)
-                    else:
-                        watnum = line[23:28].strip()
-                        # print("Skipped water " + watnum)
-                        watnumcode += '-' + watnum
-                if line[17:20] != 'LIG' and line[17:20] != 'HOH':  # ligand lines are also removed
-                    new_receptor_pdb.append(line)
-
-
-            name = receptor_file[0:-4] + watnumcode
-            # print('CODE:', name)
-            mol.data['TransFSReceptor'] = name
-
-
-            if watnumcode not in sdf_writers:
-                # we've not yet encountered this combination of waters so need to write the PDB file
-
-                dir = os.path.sep.join([work_dir, name])
-                log('WRITING to :', dir)
-
-                os.mkdir(dir)
-                paths.append(dir)
-
-                sdf_writers[watnumcode] = pybel.Outputfile("sdf", os.path.sep.join([dir, 'ligands.sdf']))
-
-                # writing into new pdb file
-                receptor_writer = open(os.path.sep.join([work_dir, name + '.pdb']), "w+")
-                for line in new_receptor_pdb:
-                    receptor_writer.write(str(line))
-                receptor_writer.close()
-
-            # write the molecule to the corresponding SDF file
-            sdf_out = sdf_writers[watnumcode]
-            sdf_out.write(mol)
-
-        except Exception as e:
-            log('Failed to handle molecule: '+ str(e))
-            continue
-
-    for writer in sdf_writers.values():
-        writer.close()
-
-    log('Wrote', count, 'molecules and', len(sdf_writers), 'proteins')
-
-def write_inputs(protein_file, ligands_file, distance):
-    """
-    Runs gninatyper on the proteins and ligands and generates the input.types file that will tell the predictor
-    what ligands correspond to what proteins.
-
-    :param protein_file:
-    :param ligands_file:
-    :param distance:
-    :return:
-    """
-
-    global types_file_name
-    global work_dir
-    global inputs_protein
-    global inputs_ligands
-    global prepared_ligands
-
-    write_raw_inputs(protein_file, ligands_file, distance)
-
-    types_path = os.path.sep.join([work_dir, types_file_name])
-    log("Writing types to", types_path)
-    with open(types_path, 'w') as types_file:
-
-        for path in paths:
-
-            log("Gninatyping ligands in", path)
-
-            ligands_dir = os.path.sep.join([path, 'ligands'])
-            os.mkdir(ligands_dir)
-
-            cmd1 = ['gninatyper', os.path.sep.join([path, 'ligands.sdf']), os.path.sep.join([ligands_dir, 'ligand'])]
-            log('CMD:', cmd1)
-            exit_code = subprocess.call(cmd1)
-            log("Status:", exit_code)
-            if exit_code:
-                raise Exception("Failed to write ligands")
-            ligand_gninatypes = os.listdir(os.path.sep.join([path, 'ligands']))
-
-            log("Gninatyping proteins in", path)
-
-            proteins_dir = os.path.sep.join([path, 'proteins'])
-            os.mkdir(proteins_dir)
-
-            cmd2 = ['gninatyper', path + '.pdb', os.path.sep.join([proteins_dir, 'protein'])]
-            log('CMD:', cmd2)
-            exit_code = subprocess.call(cmd2)
-            log("Status:", exit_code)
-            if exit_code:
-                raise Exception("Failed to write proteins")
-            protein_gninatypes = os.listdir(os.path.sep.join([path, 'proteins']))
-
-            num_proteins = 0
-            num_ligands = 0
-
-            for protein in protein_gninatypes:
-                num_proteins += 1
-                num_ligands = 0
-                inputs_protein.append(protein)
-                inputs_protein.append(os.path.sep.join([path, 'proteins', protein]))
-                for ligand in ligand_gninatypes:
-                    num_ligands += 1
-                    log("Handling", protein, ligand)
-                    inputs_ligands.append(os.path.sep.join([path, 'ligands', ligand]))
-                    line = "0 {0}{3}proteins{3}{1} {0}{3}ligands{3}{2}\n".format(path, protein, ligand, os.path.sep)
-                    types_file.write(line)
-
-    return num_proteins, num_ligands
-
-
-def generate_predictions_filename(work_dir, predict_file_name):
-    return "{0}{1}{2}".format(work_dir, os.path.sep, predict_file_name)
-
-
-def run_predictions(model):
-    global types_file_name
-    global predict_file_name
-    global work_dir
-    # python3 scripts/predict.py -m resources/dense.prototxt -w resources/weights.caffemodel -i work_0/test_set.types >> work_0/caffe_output/predictions.txt
-    cmd1 = ['python3', '/train/fragalysis_test_files/scripts/predict.py',
-            '-m', '/train/fragalysis_test_files/resources/dense.prototxt',
-            '-w', os.path.sep.join(['/train/fragalysis_test_files/resources', model]),
-            '-i', os.path.sep.join([work_dir, types_file_name]),
-            '-o', os.path.sep.join([work_dir, predict_file_name])]
-    log("CMD:", cmd1)
-    subprocess.call(cmd1)
-
-
-def mock_predictions():
-    global work_dir
-    global predict_file_name
-
-    log("WARNING: generating mock results instead of running on GPU")
-    outfile = generate_predictions_filename(work_dir, predict_file_name)
-    count = 0
-    with open(outfile, 'w') as predictions:
-        for path in paths:
-            log("Reading", path)
-            protein_gninatypes = os.listdir(os.path.sep.join([path, 'proteins']))
-            ligand_gninatypes = os.listdir(os.path.sep.join([path, 'ligands']))
-            for protein in protein_gninatypes:
-                for ligand in ligand_gninatypes:
-                    count += 1
-                    score = random.random()
-                    line = "{0} | 0 {1}{4}proteins{4}{2} {1}{4}ligands{4}{3}\n".format(score, path, protein, ligand,
-                                                                                       os.path.sep)
-                    # log("Writing", line)
-                    predictions.write(line)
-
-    log('Wrote', count, 'mock predictions')
-
-
-def read_predictions():
-    global predict_file_name
-    global work_dir
-    scores = {}
-    with open("{0}{1}{2}".format(work_dir, os.path.sep, predict_file_name), 'r') as input:
-        for line in input:
-            # log(line)
-            tokens = line.split()
-            if len(tokens) == 5 and tokens[1] == '|':
-                # log(len(tokens), tokens[0], tokens[3], tokens[4])
-                record_no = inputs_ligands.index(tokens[4])
-                if record_no is not None:
-                    # log(record_no, tokens[0])
-                    scores[record_no] = tokens[0]
-    log("Found", len(scores), "scores")
-    return scores
-
-
-def patch_scores_sdf(outfile, scores):
-
-    counter = 0
-    sdf_path = "{0}{1}{2}".format(work_dir, os.path.sep, outfile)
-    log("Writing results to {0}".format(sdf_path))
-    sdf_file = pybel.Outputfile("sdf", sdf_path)
-
-    for path in paths:
-        for mol in pybel.readfile("sdf", os.path.sep.join([path, 'ligands.sdf'])):
-            if counter in scores:
-                score = scores[counter]
-                # og("Score for record {0} is {1}".format(counter, score))
-                mol.data['TransFSScore'] = score
-                sdf_file.write(mol)
-            else:
-                log("No score found for record", counter)
-            counter += 1
-    sdf_file.close()
-
-
-def execute(ligands_sdf, protein, outfile, distance, model='weights.caffemodel', mock=False):
-
-    write_inputs(protein, ligands_sdf, distance)
-    if mock:
-        mock_predictions()
-    else:
-        run_predictions(model)
-    scores = read_predictions()
-    patch_scores_sdf(outfile, scores)
-
-
-def main():
-    global work_dir
-
-    parser = argparse.ArgumentParser(description='XChem deep - pose scoring')
-
-    parser.add_argument('-i', '--input', help="SDF containing the poses to score)")
-    parser.add_argument('-r', '--receptor', help="Receptor file for scoring (PDB format)")
-    parser.add_argument('-d', '--distance', type=float, default=2.0, help="Cuttoff for removing waters")
-    parser.add_argument('-o', '--outfile', default='output.sdf', help="File name for results")
-    parser.add_argument('-w', '--work-dir', default=".", help="Working directory")
-    parser.add_argument('-m', '--model', default="weights.caffemodel", help="Model to use for predictions")
-    parser.add_argument('--mock', action='store_true', help='Generate mock scores rather than run on GPU')
-
-    args = parser.parse_args()
-    log("XChem deep args: ", args)
-
-    work_dir = args.work_dir
-
-    execute(args.input, args.receptor, args.outfile, args.distance, model=args.model, mock=args.mock)
-
-
-if __name__ == "__main__":
-    main()
--- a/transfs.xml	Wed Apr 08 07:28:49 2020 -0400
+++ b/transfs.xml	Mon Apr 13 10:54:08 2020 -0400
@@ -1,49 +1,42 @@
-<tool id="xchem_transfs_scoring" name="XChem TransFS pose scoring" version="0.3.0">
+<tool id="xchem_transfs_scoring" name="XChem TransFS pose scoring" version="0.4.0">
     <description>using deep learning</description>

     <requirements>
         <!--requirement type="package" version="3.0.0">openbabel</requirement-->
         <!--requirement type="package" version="3.7">python</requirement-->
         <!-- many other requirements are needed -->
-        <container type="docker">informaticsmatters/deep-app-ubuntu-1604:1.2</container>
+        <container type="docker">informaticsmatters/transfs:1.3</container>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[

     cd /train/fragalysis_test_files/ &&
-    mkdir workdir &&
+    mkdir -p workdir/tfs &&
     cd workdir &&

     cp '$ligands' ligands.sdf &&
     cp '$receptor' receptor.pdb &&
-
-    ##mkdir -p /root/train/ &&
-    ##ln -s /train/fragalysis_test_files/ /root/train/ &&

-    ##adduser centos --uid 1000 --quiet --no-create-home --system &&
-    ##apt install sudo -y &&
-
-    ## mkdir -p ligands &&
     cd ../ &&
-    python '$__tool_directory__/transfs.py' -i ./workdir/ligands.sdf -r ./workdir/receptor.pdb -d $distance -w /train/fragalysis_test_files/workdir --model '$model' &&
+    python transfs.py -i /train/fragalysis_test_files/workdir/ligands.sdf -r /train/fragalysis_test_files/workdir/receptor.pdb -d $distance -w workdir/tfs --model '$model' '$mock' &&
     ls -l &&
     ls -l workdir &&
-    sudo -u ubuntu cp ./workdir/output.sdf '$output' &&
-    head -n 10000 ./workdir/output.sdf &&
+    cp ./workdir/tfs/output.sdf '$output' &&
+    head -n 1000 ./workdir/tfs/output.sdf &&

     mkdir -p ./pdb &&
-    cp -r ./workdir/receptor*.pdb ./pdb &&
+    cp -r ./workdir/tfs/receptor*.pdb ./pdb &&
     tar -cvhf archiv.tar ./pdb &&
-    sudo -u ubuntu cp archiv.tar '$output_receptors' &&
+    cp archiv.tar '$output_receptors' &&

-    sudo -u ubuntu cp ./workdir/predictions.txt '$predictions'
-
+    cp ./workdir/tfs/predictions.txt '$predictions'

     ]]></command>

     <inputs>
-        <param type="data" name="receptor" format="pdb" label="Receptor" help="Select a receptor (pdb format)."/>
+        <param type="data" name="receptor" format="pdb" label="Receptor" help="Select a receptor (PDB format)."/>
         <param type="data" name="ligands" format="sdf,mol" label="Ligands" help="Ligands (docked poses) in SDF format)"/>
-        <param name="distance" type="float" value="2.0" min="1.0" max="5.0" label="Distance to waters" help="Remove waters closer than this distance to any ligand heavy atom"/>
+        <param name="distance" type="float" value="0" min="0" max="5.0" label="Distance to waters"
+               help="Remove waters closer than this distance to any ligand heavy atom (0 means skip this process)."/>
         <param name="model" type="select" label="Model for predictions">
             <option value="weights.caffemodel">No threshold (original model)</option>
             <option value="10nm.0_iter_50000.caffemodel">10nM threshold</option>
@@ -63,7 +56,7 @@
     </outputs>

     <tests>
-	    <test>
+        <test>
             <param name="receptor" value="receptor.pdb"/>
             <param name="ligands" value="ligands.sdf"/>
             <param name="mock" value="--mock" />
@@ -73,8 +66,21 @@
                     <has_text text="TransFSReceptor"/>
                     <has_text text="TransFSScore"/>
                 </assert_contents>
+            </output> -->
+            <!---output_collection name="pdb_files" type="list" count="2" /-->
+        </test>
+        <test>
+            <param name="receptor" value="receptor.pdb"/>
+            <param name="ligands" value="ligands.sdf"/>
+            <param name="mock" value="--mock" />
+            <param name="distance" value="0"/>
+            <output name="output" ftype="sdf">
+                <assert_contents>
+                    <not_has_text text="TransFSReceptor"/>
+                    <has_text text="TransFSScore"/>
+                </assert_contents>
             </output>
-            <!--output_collection name="pdb_files" type="list" count="2" /-->
+            <!--output_collection name="pdb_files" type="list" count="1" /-->
         </test>
     </tests>
     <help><![CDATA[
@@ -92,8 +98,23 @@

 **Inputs**

-1. The protein receptor to dock into as a file in PDB format. This should have the ligand removed but retain the waters.
-2. A set of ligand poses to score in SDF format.
+1. The protein receptor to dock into as a file in PDB format. This should have the ligand removed but can
+   retain the waters. This is specified by the 'receptor' parameter.
+2. A set of ligand poses to score in SDF format. This is specified by the 'ligands' parameter.
+
+Other parameters:
+
+'distance': Distance in Angstroms. Waters closer than this to any heavy atom in each ligand are removed.
+  A discrete set of PDB files are created missing certain waters and the ligands corresponding to that PDB file are
+  scored against it. If a distance of zero is specified then this process is skipped and all ligands are scored against
+  the receptor 'as is'. The assumption is that all waters have already been removed in this case. Specifying a
+  distance of 0 provides a significant boost in performance as the water checking step is avoided..
+
+'model': A number of models are provided:
+- weights.caffemodel - No threshold for distinction of actives and inactives (original model)
+- 10nm.0_iter_50000.caffemodel: actives are molecules from DUDE that have better than 10nM activity
+- 50nm.0_iter_50000.caffemodel: actives are molecules from DUDE that have better than 50nM activity
+- 200nm.0_iter_50000.caffemodel: actives are molecules from DUDE that have better than 200nM activity

 -----

@@ -101,14 +122,16 @@

 **Outputs**

-An SDF file is produced as output. The binding affinity scores are contained within the SDF file
+An SDF file is produced as output. The predicted scores are contained within the SDF file
 as the TransFSScore property and the PDB file (with the waters that clash with the ligand removed)
 that was used for the scoring as the TransFSReceptor property.
 Values for the score range from 0 (poor binding) to 1 (good binding).

+The raw predictions (predictions.txt) is also provided as an output.
+
 A set of PDB files is also output, each one with different crystallographic waters removed. Each ligand is
 examined against input PDB structure and the with waters that clash (any heavy atom of the ligand closer than
-the 'distance' parameter being removed. The filenames are encoded with the water numbers that are removed.
+the 'distance' parameter being removed. The file names are encoded with the water numbers that are removed.

     ]]></help>
 </tool>