# HG changeset patch # User yufei-luo # Date 1358437934 18000 # Node ID e0f8dcca02ed006bbb379fb4ce53c8ccee496637 Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data. diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/README.pdf Binary file smart_toolShed/README.pdf has changed diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/File.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/File.java Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,55 @@ +/** + * + * Copyright INRA-URGI 2009-2010 + * + * This software is governed by the CeCILL license under French law and + * abiding by the rules of distribution of free software. You can use, + * modify and/ or redistribute the software under the terms of the CeCILL + * license as circulated by CEA, CNRS and INRIA at the following URL + * "http://www.cecill.info". + * + * As a counterpart to the access to the source code and rights to copy, + * modify and redistribute granted by the license, users are provided only + * with a limited warranty and the software's author, the holder of the + * economic rights, and the successive licensors have only limited + * liability. + * + * In this respect, the user's attention is drawn to the risks associated + * with loading, using, modifying and/or developing or reproducing the + * software by the user in light of its specific status of free software, + * that may mean that it is complicated to manipulate, and that also + * therefore means that it is reserved for developers and experienced + * professionals having in-depth computer knowledge. Users are therefore + * encouraged to load and test the software's suitability as regards their + * requirements in conditions enabling the security of their systems and/or + * data to be ensured and, more generally, to use and operate it in the + * same conditions as regards security. 
/**
 * Description of one data file known to the application: its name, the
 * family ("format type") its format belongs to, and the format itself.
 *
 * The fields are package-visible because sibling classes read them directly.
 */
public class File {

  /** Name under which the file is registered. */
  String name;

  /** Family of the format (value given as {@code type} to the constructor). */
  String formatType;

  /** Format of the file. */
  String format;

  /**
   * Builds a file description.
   *
   * @param name   the file name
   * @param type   the format type, stored as {@link #formatType}
   * @param format the format
   */
  public File(String name, String type, String format) {
    this.format     = format;
    this.formatType = type;
    this.name       = name;
  }

  /** @return the file name */
  public String getName() {
    return name;
  }

  /** @return the format type of the file */
  public String getFormatType() {
    return formatType;
  }

  /** @return the format of the file */
  public String getFormat() {
    return format;
  }
}
Users are therefore + * encouraged to load and test the software's suitability as regards their + * requirements in conditions enabling the security of their systems and/or + * data to be ensured and, more generally, to use and operate it in the + * same conditions as regards security. + * + * The fact that you are presently reading this means that you have had + * knowledge of the CeCILL license and that you accept its terms. + * + */ +import java.util.*; + +public class Files { + HashMap files; + + public Files () { + files = new HashMap < String, File> (); + } + + public void addFile(String fileName, String type, String format) { + this.addFile(new File(fileName, type, format)); + } + + public void addFile(File file) { + files.put(file.name, file); + } + + public void clear() { + files.clear(); + } + + public String getType(String fileName) { + if (fileName == null) { + System.out.println("Error! Looking for format of empty file name!"); + } + if (! files.containsKey(fileName)) { + System.out.println("Oops! Format type of file " + fileName + " is not found!"); + return null; + } + return files.get(fileName).formatType; + } + + public String getFormat(String fileName) { + if (fileName == null) { + System.out.println("Error! Looking for format of empty file name!"); + } + if (! files.containsKey(fileName)) { + System.out.println("Oops! Format of file " + fileName + " is not found!"); + return null; + } + return files.get(fileName).format; + } +} + diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/FormatType.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/FormatType.java Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,64 @@ +/** + * + * Copyright INRA-URGI 2009-2010 + * + * This software is governed by the CeCILL license under French law and + * abiding by the rules of distribution of free software. 
You can use, + * modify and/ or redistribute the software under the terms of the CeCILL + * license as circulated by CEA, CNRS and INRIA at the following URL + * "http://www.cecill.info". + * + * As a counterpart to the access to the source code and rights to copy, + * modify and redistribute granted by the license, users are provided only + * with a limited warranty and the software's author, the holder of the + * economic rights, and the successive licensors have only limited + * liability. + * + * In this respect, the user's attention is drawn to the risks associated + * with loading, using, modifying and/or developing or reproducing the + * software by the user in light of its specific status of free software, + * that may mean that it is complicated to manipulate, and that also + * therefore means that it is reserved for developers and experienced + * professionals having in-depth computer knowledge. Users are therefore + * encouraged to load and test the software's suitability as regards their + * requirements in conditions enabling the security of their systems and/or + * data to be ensured and, more generally, to use and operate it in the + * same conditions as regards security. + * + * The fact that you are presently reading this means that you have had + * knowledge of the CeCILL license and that you accept its terms. 
/**
 * A named family of file formats (the "type") together with the list of
 * format names that belong to it.
 */
public class FormatType {

  /** Name of this format family. */
  String type;

  /** Format names registered for this family, in insertion order. */
  Vector < String > formats;

  /**
   * Creates a family with no format.
   *
   * @param type the name of the family
   */
  public FormatType (String type) {
    this.type    = type;
    this.formats = new Vector < String > ();
  }

  /** @return the name of this format family */
  public String getType () {
    return this.type;
  }

  /**
   * Adds a format name to this family (duplicates are not filtered).
   *
   * @param format the format name to add
   */
  public void addFormat (String format) {
    formats.add(format);
  }

  /**
   * Tells whether a format belongs to this family, ignoring case.
   *
   * @param format the format name to look for; {@code null} is never found
   * @return {@code true} if a registered format equals {@code format}
   *         case-insensitively
   */
  public boolean containsFormat (String format) {
    for (String known: formats) {
      // equalsIgnoreCase returns false for a null argument, so a null
      // query (or a null entry) no longer raises a NullPointerException.
      if (known != null && known.equalsIgnoreCase(format)) {
        return true;
      }
    }
    return false;
  }

  /** @return the live list of registered formats (not a copy) */
  public Vector < String > getFormats () {
    return formats;
  }
}
Users are therefore + * encouraged to load and test the software's suitability as regards their + * requirements in conditions enabling the security of their systems and/or + * data to be ensured and, more generally, to use and operate it in the + * same conditions as regards security. + * + * The fact that you are presently reading this means that you have had + * knowledge of the CeCILL license and that you accept its terms. + * + */ +import java.util.*; + +public class FormatsContainer { + + HashMap < String, FormatType > formatTypes; + + + public FormatsContainer() { + this.formatTypes = new HashMap < String, FormatType > (); + } + + + public void addFormat(String type, String format) { + FormatType formatType; + if (formatTypes.containsKey(type)) { + formatType = this.formatTypes.get(type); + } + else { + formatType = new FormatType(type); + this.formatTypes.put(type, formatType); + } + formatType.addFormat(format); + } + + + public Vector < String > getFormatTypes () { + Vector < String > v = new Vector < String > (); + v.addAll(this.formatTypes.keySet()); + return v; + } + + + public FormatType getFormats (String type) { + if (! formatTypes.containsKey(type)) { + System.out.print("Format type " + type + " is unavailable. 
Got: "); + Iterator it = formatTypes.entrySet().iterator(); + while (it.hasNext()) { + Map.Entry pairs = (Map.Entry) it.next(); + System.out.print(pairs.getKey() + " "); + } + System.out.println(); + } + return formatTypes.get(type); + } + + + public String getFormatType (String format) { + for (Iterator it = formatTypes.keySet().iterator(); it.hasNext(); ) { + Object type = it.next(); + Object formatType = formatTypes.get(type); + if (((FormatType) formatType).containsFormat(format)) { + return (String) type; + } + } + return null; + } +} + + diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/FormatsReader.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/FormatsReader.java Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,83 @@ +/** + * + * Copyright INRA-URGI 2009-2010 + * + * This software is governed by the CeCILL license under French law and + * abiding by the rules of distribution of free software. You can use, + * modify and/ or redistribute the software under the terms of the CeCILL + * license as circulated by CEA, CNRS and INRIA at the following URL + * "http://www.cecill.info". + * + * As a counterpart to the access to the source code and rights to copy, + * modify and redistribute granted by the license, users are provided only + * with a limited warranty and the software's author, the holder of the + * economic rights, and the successive licensors have only limited + * liability. + * + * In this respect, the user's attention is drawn to the risks associated + * with loading, using, modifying and/or developing or reproducing the + * software by the user in light of its specific status of free software, + * that may mean that it is complicated to manipulate, and that also + * therefore means that it is reserved for developers and experienced + * professionals having in-depth computer knowledge. 
Users are therefore + * encouraged to load and test the software's suitability as regards their + * requirements in conditions enabling the security of their systems and/or + * data to be ensured and, more generally, to use and operate it in the + * same conditions as regards security. + * + * The fact that you are presently reading this means that you have had + * knowledge of the CeCILL license and that you accept its terms. + * + */ +import java.util.*; +import java.io.File; +import java.io.*; + + +public class FormatsReader { + + String fileName; + Vector < FormatType > formatTypes; + Vector < String > typeNames; + + + public FormatsReader(String fileName) { + this.fileName = fileName; + this.formatTypes = new Vector < FormatType > (); + } + + + public boolean read() { + File file = new File(this.fileName); + + try { + BufferedReader reader = new BufferedReader(new FileReader(file)); + String line = null; + String[] lineElements; + String[] formats; + String typeName; + + while ((line = reader.readLine()) != null) { + if (line.length() > 0) { + lineElements = line.split(":"); + typeName = lineElements[0].trim(); + formats = lineElements[1].split(","); + for (int i = 0; i < formats.length; i++) { + Global.formats.addFormat(typeName, formats[i].trim()); + } + } + } + + reader.close(); + } + catch (FileNotFoundException e) { + return false; + } + catch (IOException e) { + return false; + } + + return true; + } +} + diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Global.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Global.java Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,70 @@ +/** + * + * Copyright INRA-URGI 2009-2010 + * + * This software is governed by the CeCILL license under French law and + * abiding by the rules of distribution of free software. 
You can use, + * modify and/ or redistribute the software under the terms of the CeCILL + * license as circulated by CEA, CNRS and INRIA at the following URL + * "http://www.cecill.info". + * + * As a counterpart to the access to the source code and rights to copy, + * modify and redistribute granted by the license, users are provided only + * with a limited warranty and the software's author, the holder of the + * economic rights, and the successive licensors have only limited + * liability. + * + * In this respect, the user's attention is drawn to the risks associated + * with loading, using, modifying and/or developing or reproducing the + * software by the user in light of its specific status of free software, + * that may mean that it is complicated to manipulate, and that also + * therefore means that it is reserved for developers and experienced + * professionals having in-depth computer knowledge. Users are therefore + * encouraged to load and test the software's suitability as regards their + * requirements in conditions enabling the security of their systems and/or + * data to be ensured and, more generally, to use and operate it in the + * same conditions as regards security. + * + * The fact that you are presently reading this means that you have had + * knowledge of the CeCILL license and that you accept its terms. 
/**
 * Application-wide mutable state, exposed as public static fields.
 *
 * NOTE(review): nothing here is synchronized; how these fields are shared
 * between threads is not visible from this class.
 */
public class Global {

  // Size of the log area; the unit is not visible here — check the callers.
  public static int logAreaSize = 100;

  // Default name of the S-MART configuration file.
  public static String smartConfFileName = "smart.conf";

  // Default name of the file listing the programs.
  public static String smartProgramsFileName = "programs.txt";

  // Default name of the file listing the formats.
  public static String smartFormatsFileName = "formats.txt";

  // Starts as an empty string; presumably set elsewhere — TODO confirm.
  public static String pythonPath = new String();

  // Default command names for the external tools.
  public static String pythonCommand = "python";

  public static String mysqlCommand = "mysql";

  public static String rCommand = "R";

  // Registry of the known files (see class Files).
  public static Files files = new Files();

  // List of file names; its relation to 'files' is not visible here.
  public static Vector < String > fileNames = new Vector < String >();

  // Known format families (see class FormatsContainer).
  public static FormatsContainer formats = new FormatsContainer();

  // Flag, initially false; presumably true while a program runs — TODO confirm.
  public static boolean programRunning = false;

  // Button -> text field associations; presumably each text field receives
  // the path picked through its button — verify against the GUI code.
  public static HashMap < JButton, JTextField > otherFilesChooser = new HashMap < JButton, JTextField >();

  public static HashMap < JButton, JTextField > otherDirectoriesChooser = new HashMap < JButton, JTextField >();

  public static HashMap < JButton, JTextField > otherFileConcatenationChooser = new HashMap < JButton, JTextField >();

}
+ frame = new JFrame("Password"); + frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + frame.setContentPane(setMainPane()); + + //Display the window. + frame.pack(); + frame.setVisible(true); + } + + + private static JPanel setMainPane() { + JPanel rootPanel = new JPanel(false); + rootPanel.setLayout(new GridLayout(0, 1)); + + JPanel infoPanel = new JPanel(false); + JLabel infoLabel = new JLabel("Please write here the password that you entered for the mySQL root account.\r\nNo information is stored nor sent. I promise."); + infoPanel.add(infoLabel); + + JPanel passPanel = new JPanel(false); + passPanel.setLayout(new GridLayout(1, 0)); + JLabel passLabel = new JLabel("password"); + final JTextField passText = new JTextField(20); + passLabel.setLabelFor(passText); + passPanel.add(passLabel); + passPanel.add(passText); + + JPanel okPanel = new JPanel(false); + JButton okButton = new JButton("OK"); + okPanel.add(okButton); + + okButton.addActionListener(new ActionListener() { + public void actionPerformed(ActionEvent e) { + password = passText.getText(); + frame.setVisible(false); + frame.dispose(); + latch.countDown(); + } + }); + + rootPanel.add(infoPanel); + rootPanel.add(passPanel); + rootPanel.add(okPanel); + + return rootPanel; + } + + + public boolean waitForPassword() { + try { + latch.await(); + } + catch (InterruptedException e) { + return false; + } + return true; + } + + + public String getPassword() { + return password; + } +} diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Installer/Old/SmartInstaller.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Installer/Old/SmartInstaller.java Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,167 @@ +import java.util.*; +import java.awt.*; +import java.awt.event.ActionEvent; +import java.awt.event.ActionListener; +import java.io.*; +import javax.swing.*; +import javax.swing.filechooser.*; +import javax.swing.border.*; +import javax.swing.SwingUtilities; +import 
java.net.*; + +public class SmartInstaller extends JPanel implements ActionListener { + int BUFFER = 1024; + + JFrame mainFrame; + JTextArea logArea; + + // configuration chooser buttons + String configurations[] = {"32 bits", "64 bits"}; + JRadioButton configurationButtons[]; + + // program chooser buttons + String programChoosers[] = {"R", "R Color Brewer Package", "R HMisc Package", "MySQL", "MySQL account", "Python 2.6", "Python DB", "S-MART"}; + JCheckBox programChooserButtons[]; + + JButton goButton; + + // install directory + JButton installDirectoryChooserButton; + JTextField installDirectoryChooserTextField; + + + public SmartInstaller() { + super(); + + Box box = Box.createVerticalBox(); + + // Header + JPanel headerPanel = new JPanel(false); + JTextArea headerArea = new JTextArea("This is the S-MART installation tool.\r\nIt will download and install the needed softwares, as well as S-MART itself.\r\nYou can unselect the software that you already have installed.\r\nDuring the installation, accept all the default parameters.\r\nPlease remember the root password if you install MySQL!"); + TitledBorder headerBorder = BorderFactory.createTitledBorder("Wellcome to the S-MART installer!"); + headerArea.setEditable(false); + headerArea.setBackground(headerPanel.getBackground()); + headerPanel.add(headerArea); + headerPanel.setBorder(headerBorder); + + + // Configuration + JPanel configurationPanel = new JPanel(false); + configurationPanel.setLayout(new GridLayout(1, 0)); + configurationButtons = new JRadioButton[configurations.length]; + ButtonGroup configurationGroup = new ButtonGroup(); + for (int i = 0; i < configurations.length; i++) { + JRadioButton button = new JRadioButton(configurations[i]); + configurationPanel.add(button); + configurationButtons[i] = button; + configurationGroup.add(button); + } + configurationButtons[0].setSelected(true); + TitledBorder configurationBorder = BorderFactory.createTitledBorder("Configuration"); + 
configurationPanel.setBorder(configurationBorder); + + + // Program chooser panel + JPanel programPanel = new JPanel(false); + programPanel.setLayout(new GridLayout(0, 1)); + + JLabel programLabel = new JLabel("Choose which programs to install:"); + programPanel.add(programLabel); + programChooserButtons = new JCheckBox[programChoosers.length]; + for (int i = 0; i < programChoosers.length; i++) { + JCheckBox button = new JCheckBox(programChoosers[i]); + button.setSelected(true); + programPanel.add(button); + programChooserButtons[i] = button; + } + TitledBorder programBorder = BorderFactory.createTitledBorder("Programs"); + programPanel.setBorder(programBorder); + + // Install directory chooser + JPanel installDirectoryChooserPanel = new JPanel(false); + installDirectoryChooserPanel.setLayout(new GridLayout(1, 0)); + JLabel installDirectoryChooserLabel = new JLabel("Choose a directory to install S-MART: "); + installDirectoryChooserTextField = new JTextField(); + installDirectoryChooserButton = new JButton("Open..."); + installDirectoryChooserButton.addActionListener(this); + + installDirectoryChooserPanel.add(installDirectoryChooserLabel); + installDirectoryChooserPanel.add(installDirectoryChooserTextField); + installDirectoryChooserPanel.add(installDirectoryChooserButton); + TitledBorder installDirectoryChooserBorder = BorderFactory.createTitledBorder("Installation directory"); + installDirectoryChooserPanel.setBorder(installDirectoryChooserBorder); + + // GO! 
+ JPanel goPanel = new JPanel(false); + goButton = new JButton("GO!"); + goButton.addActionListener(this); + goButton.setSelected(true); + goPanel.add(goButton); + TitledBorder goBorder = BorderFactory.createTitledBorder("Start install"); + goPanel.setBorder(goBorder); + + // Log + logArea = new JTextArea(10, 120); + logArea.setFont(new Font("Monospaced", logArea.getFont().getStyle(), logArea.getFont().getSize())); + JScrollPane logScroll = new JScrollPane(logArea, JScrollPane.VERTICAL_SCROLLBAR_ALWAYS, JScrollPane.HORIZONTAL_SCROLLBAR_AS_NEEDED); + TitledBorder logBorder = BorderFactory.createTitledBorder("Log"); + logScroll.setBorder(logBorder); + + GridLayout horizontalLayout = new GridLayout(1, 0); + + box.add(headerPanel); + box.add(configurationPanel); + box.add(programPanel); + box.add(installDirectoryChooserPanel); + box.add(goPanel); + box.add(logScroll); + + add(box); + } + + + public void actionPerformed(ActionEvent e) { + + // Install directories chooser + if (e.getSource() == goButton) { + boolean[] selectedPrograms = new boolean[programChoosers.length]; + for (int i = 0; i < programChoosers.length; i++) { + selectedPrograms[i] = programChooserButtons[i].isSelected(); + } + SmartInstallerTask task = new SmartInstallerTask(logArea, selectedPrograms, installDirectoryChooserTextField.getText(), (configurationButtons[0].isSelected())? 0: 1); + task.execute(); + } + // Install directories chooser + else if (e.getSource() == installDirectoryChooserButton) { + JFileChooser chooser = new JFileChooser(); + chooser.setFileSelectionMode(JFileChooser.DIRECTORIES_ONLY); + if (chooser.showOpenDialog(mainFrame) == JFileChooser.APPROVE_OPTION) { + installDirectoryChooserTextField.setText(chooser.getSelectedFile().getPath()); + } + } + } + + private static void createAndShowGUI() { + // Create and set up the window. + JFrame mainFrame = new JFrame("S-Mart Installer"); + mainFrame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + + //Create and set up the content pane. 
+ JComponent newContentPane = new SmartInstaller(); + newContentPane.setOpaque(true); + mainFrame.setContentPane(newContentPane); + + // Display the window. + mainFrame.pack(); + mainFrame.setVisible(true); + } + + + public static void main(String[] args) { + javax.swing.SwingUtilities.invokeLater(new Runnable() { + public void run() { + createAndShowGUI(); + } + }); + } +} diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Installer/Old/SmartInstallerTask.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Installer/Old/SmartInstallerTask.java Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,455 @@ +import java.util.*; +import java.awt.event.ActionEvent; +import java.awt.event.ActionListener; +import java.io.*; +import javax.swing.*; +import javax.swing.filechooser.*; +import javax.swing.border.*; +import javax.swing.SwingUtilities; +import java.net.*; +import java.util.Stack; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +public class SmartInstallerTask extends SwingWorker { + + int BUFFER = 1024; + + int architecture = 0; + String installDirectoryName = null; + JTextArea logArea = null; + boolean[] selectedPrograms = null; + + // program chooser buttons + String programChoosers[] = {"R", "R Color Brewer Package", "R HMisc Package", "MySQL", "MySQL account", "Python 2.6", "Python DB", "S-MART"}; + + // Web addresses for the tools + String packageAddresses[][] = { + {"http://cran.cict.fr/bin/windows/base/R-2.11.0-win32.exe", "http://cran.cict.fr/bin/windows64/base/R-2.11.0-win64.exe"}, + {"", ""}, + {"", ""}, + {"http://mirrors.ircam.fr/pub/mysql/Downloads/MySQL-5.1/mysql-essential-5.1.47-win32.msi", "http://mirrors.ircam.fr/pub/mysql/Downloads/MySQL-5.1/mysql-essential-5.1.47-winx64.msi"}, + {"", ""}, + {"http://www.python.org/ftp/python/2.6.5/python-2.6.5.msi", "http://www.python.org/ftp/python/2.6.5/python-2.6.5.amd64.msi"}, + {"http://www.technicalbard.com/files/MySQL-python-1.2.2.win32-py2.6.exe", 
"http://www.technicalbard.com/files/MySQL-python-1.2.2.win32-py2.6.exe"}, + {"http://urgi.versailles.inra.fr/download/s-mart/s-mart.zip", "http://urgi.versailles.inra.fr/download/s-mart/s-mart.zip"} + }; + + // Packages to install + String rPackages[] = {"RColorBrewer", "Hmisc"}; + + // Script lines + String scriptLines[][] = { + {"\"\\R-2.11.0-win32.exe\"", "\"\\R-2.11.0-win64.exe\""}, + {"\"\" CMD BATCH \"\\installRColorBrewer.R\"", "\"\" CMD BATCH \"\\installRColorBrewer.R\""}, + {"\"\" CMD BATCH \"\\installHmisc.R\"", "\"\" CMD BATCH \"\\installHmisc.R\""}, + {"msiexec /i \"\\mysql-essential-5.1.47-win32.msi\"", "msiexec /i \"\\mysql-essential-5.1.47-winx64.msi\""}, + {"", ""}, + {"msiexec /i \"\\python-2.6.5.msi\"", "msiexec /i \"\\python-2.6.5.amd64.msi\""}, + {"\\MySQL-python-1.2.2.win32-py2.6.exe", "\\MySQL-python-1.2.2.win32-py2.6.exe"}, + {"", ""} + }; + + // Files to uncompress + String compressedFiles[][] = { + {"", ""}, + {"", ""}, + {"", ""}, + {"", ""}, + {"", ""}, + {"", ""}, + {"", ""}, + {"\\s-mart.zip", "\\s-mart.zip"} + }; + + + public SmartInstallerTask(JTextArea ta, boolean[] b, String s, int a) { + logArea = ta; + selectedPrograms = b; + installDirectoryName = s; + architecture = a; + } + + + @Override + public Boolean doInBackground() { + boolean installOk; + publish("Starting install\n"); + writeFiles(); + for (int i = 0; i < selectedPrograms.length; i++) { + if (selectedPrograms[i]) { + if (! 
install(i)) { + return Boolean.FALSE; + } + } + } + removeFiles(); + setEnvironmentVariables(); + publish("Ending install\n"); + return Boolean.TRUE; + } + + + @Override + protected void process(List chunks) { + for (String chunk: chunks) { + logArea.append(chunk); + } + } + + + private boolean launch(String command) { + return realLaunch(new ProcessBuilder(command), command); + } + + private boolean launch(String[] command) { + return realLaunch(new ProcessBuilder(command), Arrays.toString(command)); + } + + private boolean realLaunch(ProcessBuilder pb, String command) { + BufferedReader outputReader; + pb = pb.redirectErrorStream(true); + Process process = null; + publish(" Starting command '" + command + "'\n"); + try { + process = pb.start(); + BufferedInputStream outputStream = new BufferedInputStream(process.getInputStream()); + InputStream is = process.getInputStream(); + InputStreamReader isr = new InputStreamReader(is); + outputReader = new BufferedReader(isr); + } + catch (Exception exception) { + publish(" !Process cannot be started (command is '" + command + "')!\n"); + exception.printStackTrace(); + return false; + } + if (outputReader == null) { + publish(" !Problem in the output of the command!\n"); + return false; + } + else { + publish(" Output is:\n"); + try { + publish(" ---\n"); + String line; + while ((line = outputReader.readLine()) != null) { + publish(" " + line + "\r\n"); + } + publish(" ---\n"); + } + catch (IOException e) { + publish(" !Cannot get the output of the command!\n"); + return false; + } + } + int exitValue = process.exitValue(); + if (exitValue != 0) { + publish(" !Problem during the execution of the command '" + command + "'!\n"); + return false; + } + publish(" Ending command '" + command + "'\n"); + return true; + } + + + private File lookForFile(String fileName, String[] putativePlaces) { + publish(" Looking for file " + fileName + "\n"); + for (String place: putativePlaces) { + File file = new File(place, fileName); + 
publish(" Look at " + file.getAbsolutePath() + "\n"); + if (file.exists()) { + publish(" Found it in expected place " + file.getAbsolutePath() + "\n"); + return file; + } + } + Stack files = new Stack(); + files.push(new File("\\")); + while (! files.empty()) { + File file = files.pop(); + for (File childFile: file.listFiles()) { + if (childFile.isDirectory()) { + files.push(childFile); + } + else { + if (fileName.compareToIgnoreCase(childFile.getName()) == 0) { + publish(" Found it in unexpected place " + childFile.getAbsolutePath() + "\n"); + return childFile; + } + } + } + } + publish(" !Cannot file file '" + fileName + "'!\n"); + return null; + } + + + private boolean writeFile(String fileName, String content) { + try { + FileWriter fw = new FileWriter(fileName); + BufferedWriter bw = new BufferedWriter(fw); + bw.write(content); + bw.close(); + fw.close(); + } + catch (Exception e) { + publish(" !Cannot write file '" + fileName + "'!\n"); + return false; + } + return true; + } + + + private boolean removeFile(String fileName) { + File file = new File(fileName); + if (file.exists()) { + if (! file.delete()) { + publish(" !Cannot delete file '" + file.getAbsolutePath() + "'!\n"); + return false; + } + } + return true; + } + + + private boolean writeFiles() { + for (String rPackage: rPackages) { + String fileName = installDirectoryName + File.separator + "install" + rPackage + ".R"; + String content = "install.packages(\"" + rPackage + "\", repos = \"http://cran.cict.fr\", dependencies = TRUE)\n"; + if (! writeFile(fileName, content)) { + publish(" !Cannot write file for R package '" + rPackage + "'!\n"); + return false; + } + } + String fileName = installDirectoryName + File.separator + "createUser.sql"; + String content = "CREATE USER 'smart'@'localhost';\nGRANT ALL PRIVILEGES ON *.* TO 'smart'@'localhost' WITH GRANT OPTION;\nCREATE DATABASE smart;\nGRANT ALL ON smart.* TO 'smart'@'localhost';\n"; + if (! 
writeFile(fileName, content)) { + publish(" !Cannot write mySQL configuration file!\n"); + return false; + } + return true; + } + + private boolean removeFiles() { + for (String rPackage: rPackages) { + File file = new File(installDirectoryName + File.separator + "install" + rPackage + ".R"); + if (! file.delete()) { + publish("!Cannot delete R install file for " + rPackage + "!\n"); + return false; + } + } + File file = new File(installDirectoryName + File.separator + "createUser.sql"); + if (! file.delete()) { + publish("!Cannot delete mySQL configuration file!\n"); + return false; + } + return true; + } + + private boolean install(int element) { + publish(" Starting install of " + programChoosers[element] + "\n"); + downloadPackage(element); + executeInstall(element); + uncompressPackage(element); + removePackage(element); + postProcess(element); + publish(" Ending install of " + programChoosers[element] + "\n"); + return true; + } + + + private String getLocalName(String remoteName) { + String localName = installDirectoryName + File.separator + (new File(remoteName)).getName(); + int position = localName.indexOf("?"); + if (position >= 0) { + localName = localName.substring(0, position); + } + return localName; + } + + + private boolean downloadPackage(int element) { + String fileName = packageAddresses[element][architecture]; + if (! 
"".equals(fileName)) { + publish(" Starting download of " + programChoosers[element] + "\n"); + try { + BufferedInputStream bis = new BufferedInputStream(new URL(fileName).openStream()); + FileOutputStream fos = new FileOutputStream(getLocalName(fileName)); + BufferedOutputStream bos = new BufferedOutputStream(fos, BUFFER); + byte[] data = new byte[BUFFER]; + int x = 0; + while((x = bis.read(data, 0, BUFFER)) >= 0) { + bos.write(data, 0, x); + } + bos.close(); + fos.close(); + bis.close(); + } + catch (IOException e) { + publish(" !Cannot download file '" + fileName + "'!\n"); + return false; + } + publish(" Ending download of " + programChoosers[element] + "\n"); + } + return true; + } + + + private String replaceSubstring(String line) { + if (line.contains("")) { + String protectedDirectory = installDirectoryName.replaceAll("\\\\", "\\\\\\\\"); + line = line.replaceAll("", protectedDirectory); + } + if (line.contains("")) { + String userName = System.getenv().get("USERNAME"); + String[] possibleRDirectories = {"C:\\Program Files\\R-2.11.0", "C:\\Documents and Settings\\" + userName + "\\Mes documents\\R\\R-2.11.0\\bin", "C:\\Documents and Settings\\" + userName + "\\My documents\\R\\R-2.11.0\\bin"}; + String rDirectory = lookForFile("'.exe", possibleRDirectories).getAbsolutePath(); + rDirectory = rDirectory.replaceAll("\\\\", "\\\\\\\\"); + line = line.replaceAll("", rDirectory); + } + if (line.contains("")) { + String userName = System.getenv().get("USERNAME"); + String[] possibleRDirectories = {"C:\\Program Files\\MySQL\\MySQL Server 5.1\\bin", "C:\\Documents and Settings\\" + userName + "\\Mes documents\\MySQL\\MySQL Server 5.1\\bin", "C:\\Documents and Settings\\" + userName + "\\My documents\\MySQL\\MySQL Server 5.1\\bin"}; + String rDirectory = lookForFile("mysql.exe", possibleRDirectories).getAbsolutePath(); + rDirectory = rDirectory.replaceAll("\\\\", "\\\\\\\\"); + line = line.replaceAll("", rDirectory); + } + return line; + } + + + private boolean 
executeInstall(int element) { + String commands = scriptLines[element][architecture]; + if (! "".equals(commands)) { + for (String command: commands.split(";")) { + command = replaceSubstring(command); + publish(" Starting command '" + command + "'\n"); + Process process = null; + try { + process = Runtime.getRuntime().exec(command); + } + catch (IOException e) { + publish(" !Cannot execute command '" + command + "'!\n"); + return false; + } + try { + process.waitFor(); + } + catch (InterruptedException e) { + publish(" !Cannot wait for the end of the command '" + command + "'!\n"); + return false; + } + int exitValue = process.exitValue(); + if (exitValue != 0) { + publish(" !Problem during the execution of the command '" + command + "'!\n"); + return false; + } + publish(" Ending command '" + command + "'\n"); + } + } + return true; + } + + + private boolean uncompressPackage(int element) { + String file = compressedFiles[element][architecture]; + if (! "".equals(file)) { + file = replaceSubstring(file); + publish(" Starting uncompressing file '" + file + "'\n"); + try { + FileInputStream fis = new FileInputStream(file); + BufferedInputStream bis = new BufferedInputStream(fis); + ZipInputStream zis = new ZipInputStream(bis); + ZipEntry entry; + while ((entry = zis.getNextEntry()) != null) { + if (! entry.isDirectory()) { + File newFile = new File(installDirectoryName + File.separator + entry.getName()); + // create parent directories + File upDirectory = newFile.getParentFile(); + while (upDirectory != null){ + if (! 
upDirectory.exists()) { + upDirectory.mkdir(); + publish(" Creating directory '" + upDirectory.getAbsolutePath() + "'\n"); + } + upDirectory = upDirectory.getParentFile(); + } + // write the files to the disk + publish(" Extracting '" + entry.getName() + "' to '" + newFile.getAbsolutePath() + "'\n"); + int count; + byte data[] = new byte[BUFFER]; + FileOutputStream fos = new FileOutputStream(newFile); + BufferedOutputStream bos = new BufferedOutputStream(fos, BUFFER); + while ((count = zis.read(data, 0, BUFFER)) != -1){ + bos.write(data, 0, count); + } + bos.flush(); + bos.close(); + fos.close(); + } + } + zis.close(); + bis.close(); + fis.close(); + } + catch(FileNotFoundException e) { + publish(" !Cannot find file '" + file + "'!\n"); + return false; + } + catch(Exception e){ + publish(" !Cannot uncompress file '" + file + "'!\n"); + return false; + } + publish(" Ending uncompressing file '" + file + "'\n"); + } + return true; + } + + + private boolean removePackage(int element) { + String packageName = packageAddresses[element][architecture]; + if ("".equals(packageName)) { + return true; + } + String fileName = getLocalName(packageAddresses[element][architecture]); + return removeFile(fileName); + } + + + private boolean postProcess(int element) { + switch (element) { + case 4: + // Create mySQL user + PasswordAsker pa = new PasswordAsker(); + if (! pa.waitForPassword()) { + publish("Problem in the password asker!\n"); + return false; + } + String command = "\"\" --user=root --password=" + pa.getPassword() + " -e \"source \\createUser.sql\""; + command = replaceSubstring(command); + if (! 
launch(command)) { + publish(" !Cannot create SQL accounts!\n"); + return false; + } + return true; + case 7: + // Move S-MART files to parent directory + File installDirectory = new File(installDirectoryName + File.separator + "S-Mart"); + for (File file: installDirectory.listFiles()) { + File destinationFile = new File(file.getParentFile().getParentFile(), file.getName()); + if (! file.renameTo(destinationFile)) { + publish(" !Cannot move '" + file.getAbsolutePath() + "' to '" + destinationFile.getAbsolutePath() + "'!\n"); + } + } + if (! installDirectory.delete()) { + publish(" !Cannot remove installation S-MART directory '" + installDirectory.getAbsolutePath() + "'!\n"); + } + } + return true; + } + + + private boolean setEnvironmentVariables() { + String[] command = {"REG", "ADD", "HKCU\\Environment", "/v", "PYTHONPATH", "/t", "REG_SZ", "/d", "\"" + installDirectoryName + "\\Python\"", "/f"}; + return launch(command); + } +} + diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Installer/PasswordAsker.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Installer/PasswordAsker.java Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,87 @@ +import java.awt.*; +import java.awt.event.*; +import javax.swing.*; +import java.util.concurrent.CountDownLatch; + +public class PasswordAsker { + + static String password; + static JFrame frame; + static CountDownLatch latch; + + + public PasswordAsker() { + password = null; + javax.swing.SwingUtilities.invokeLater(new Runnable() { + public void run() { + createAndShowGUI(); + } + }); + latch = new CountDownLatch(1); + } + + + private static void createAndShowGUI() { + //Create and set up the window. + frame = new JFrame("Password"); + frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + frame.setContentPane(setMainPane()); + + //Display the window. 
+ frame.pack(); + frame.setVisible(true); + } + + + private static JPanel setMainPane() { + JPanel rootPanel = new JPanel(false); + rootPanel.setLayout(new GridLayout(0, 1)); + + JPanel infoPanel = new JPanel(false); + JLabel infoLabel = new JLabel("Please write here the password that you entered for the mySQL root account.\r\nNo information is stored nor sent. I promise."); + infoPanel.add(infoLabel); + + JPanel passPanel = new JPanel(false); + passPanel.setLayout(new GridLayout(1, 0)); + JLabel passLabel = new JLabel("password"); + final JTextField passText = new JTextField(20); + passLabel.setLabelFor(passText); + passPanel.add(passLabel); + passPanel.add(passText); + + JPanel okPanel = new JPanel(false); + JButton okButton = new JButton("OK"); + okPanel.add(okButton); + + okButton.addActionListener(new ActionListener() { + public void actionPerformed(ActionEvent e) { + password = passText.getText(); + frame.setVisible(false); + frame.dispose(); + latch.countDown(); + } + }); + + rootPanel.add(infoPanel); + rootPanel.add(passPanel); + rootPanel.add(okPanel); + + return rootPanel; + } + + + public boolean waitForPassword() { + try { + latch.await(); + } + catch (InterruptedException e) { + return false; + } + return true; + } + + + public String getPassword() { + return password; + } +} diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Installer/SmartInstaller.jar Binary file smart_toolShed/SMART/Java/Installer/SmartInstaller.jar has changed diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Installer/SmartInstaller.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Installer/SmartInstaller.java Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,167 @@ +import java.util.*; +import java.awt.*; +import java.awt.event.ActionEvent; +import java.awt.event.ActionListener; +import java.io.*; +import javax.swing.*; +import javax.swing.filechooser.*; +import javax.swing.border.*; +import javax.swing.SwingUtilities; +import 
java.net.*; + +public class SmartInstaller extends JPanel implements ActionListener { + int BUFFER = 1024; + + JFrame mainFrame; + JTextArea logArea; + + // configuration chooser buttons + String configurations[] = {"32 bits", "64 bits"}; + JRadioButton configurationButtons[]; + + // program chooser buttons + String programChoosers[] = {"R", "R Color Brewer Package", "R HMisc Package", "Python 2.6", "S-MART"}; + JCheckBox programChooserButtons[]; + + JButton goButton; + + // install directory + JButton installDirectoryChooserButton; + JTextField installDirectoryChooserTextField; + + + public SmartInstaller() { + super(); + + Box box = Box.createVerticalBox(); + + // Header + JPanel headerPanel = new JPanel(false); + JTextArea headerArea = new JTextArea("This is the S-MART installation tool.\r\nIt will download and install the needed softwares, as well as S-MART itself.\r\nYou can unselect the software that you already have installed.\r\nDuring the installation, accept all the default parameters."); + TitledBorder headerBorder = BorderFactory.createTitledBorder("Welcome to the S-MART installer!"); + headerArea.setEditable(false); + headerArea.setBackground(headerPanel.getBackground()); + headerPanel.add(headerArea); + headerPanel.setBorder(headerBorder); + + + // Configuration + JPanel configurationPanel = new JPanel(false); + configurationPanel.setLayout(new GridLayout(1, 0)); + configurationButtons = new JRadioButton[configurations.length]; + ButtonGroup configurationGroup = new ButtonGroup(); + for (int i = 0; i < configurations.length; i++) { + JRadioButton button = new JRadioButton(configurations[i]); + configurationPanel.add(button); + configurationButtons[i] = button; + configurationGroup.add(button); + } + configurationButtons[0].setSelected(true); + TitledBorder configurationBorder = BorderFactory.createTitledBorder("Configuration"); + configurationPanel.setBorder(configurationBorder); + + + // Program chooser panel + JPanel programPanel = new 
JPanel(false); + programPanel.setLayout(new GridLayout(0, 1)); + + JLabel programLabel = new JLabel("Choose which programs to install:"); + programPanel.add(programLabel); + programChooserButtons = new JCheckBox[programChoosers.length]; + for (int i = 0; i < programChoosers.length; i++) { + JCheckBox button = new JCheckBox(programChoosers[i]); + button.setSelected(true); + programPanel.add(button); + programChooserButtons[i] = button; + } + TitledBorder programBorder = BorderFactory.createTitledBorder("Programs"); + programPanel.setBorder(programBorder); + + // Install directory chooser + JPanel installDirectoryChooserPanel = new JPanel(false); + installDirectoryChooserPanel.setLayout(new GridLayout(1, 0)); + JLabel installDirectoryChooserLabel = new JLabel("Choose a directory to install S-MART: "); + installDirectoryChooserTextField = new JTextField(); + installDirectoryChooserButton = new JButton("Open..."); + installDirectoryChooserButton.addActionListener(this); + + installDirectoryChooserPanel.add(installDirectoryChooserLabel); + installDirectoryChooserPanel.add(installDirectoryChooserTextField); + installDirectoryChooserPanel.add(installDirectoryChooserButton); + TitledBorder installDirectoryChooserBorder = BorderFactory.createTitledBorder("Installation directory"); + installDirectoryChooserPanel.setBorder(installDirectoryChooserBorder); + + // GO! 
+ JPanel goPanel = new JPanel(false); + goButton = new JButton("GO!"); + goButton.addActionListener(this); + goButton.setSelected(true); + goPanel.add(goButton); + TitledBorder goBorder = BorderFactory.createTitledBorder("Start install"); + goPanel.setBorder(goBorder); + + // Log + logArea = new JTextArea(10, 120); + logArea.setFont(new Font("Monospaced", logArea.getFont().getStyle(), logArea.getFont().getSize())); + JScrollPane logScroll = new JScrollPane(logArea, JScrollPane.VERTICAL_SCROLLBAR_ALWAYS, JScrollPane.HORIZONTAL_SCROLLBAR_AS_NEEDED); + TitledBorder logBorder = BorderFactory.createTitledBorder("Log"); + logScroll.setBorder(logBorder); + + GridLayout horizontalLayout = new GridLayout(1, 0); + + box.add(headerPanel); + box.add(configurationPanel); + box.add(programPanel); + box.add(installDirectoryChooserPanel); + box.add(goPanel); + box.add(logScroll); + + add(box); + } + + + public void actionPerformed(ActionEvent e) { + + // Install directories chooser + if (e.getSource() == goButton) { + boolean[] selectedPrograms = new boolean[programChoosers.length]; + for (int i = 0; i < programChoosers.length; i++) { + selectedPrograms[i] = programChooserButtons[i].isSelected(); + } + SmartInstallerTask task = new SmartInstallerTask(logArea, selectedPrograms, installDirectoryChooserTextField.getText(), (configurationButtons[0].isSelected())? 0: 1); + task.execute(); + } + // Install directories chooser + else if (e.getSource() == installDirectoryChooserButton) { + JFileChooser chooser = new JFileChooser(); + chooser.setFileSelectionMode(JFileChooser.DIRECTORIES_ONLY); + if (chooser.showOpenDialog(mainFrame) == JFileChooser.APPROVE_OPTION) { + installDirectoryChooserTextField.setText(chooser.getSelectedFile().getPath()); + } + } + } + + private static void createAndShowGUI() { + // Create and set up the window. + JFrame mainFrame = new JFrame("S-Mart Installer"); + mainFrame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + + //Create and set up the content pane. 
+ JComponent newContentPane = new SmartInstaller(); + newContentPane.setOpaque(true); + mainFrame.setContentPane(newContentPane); + + // Display the window. + mainFrame.pack(); + mainFrame.setVisible(true); + } + + + public static void main(String[] args) { + javax.swing.SwingUtilities.invokeLater(new Runnable() { + public void run() { + createAndShowGUI(); + } + }); + } +} diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Installer/SmartInstallerTask.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Installer/SmartInstallerTask.java Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,419 @@ +import java.util.*; +import java.awt.event.ActionEvent; +import java.awt.event.ActionListener; +import java.io.*; +import javax.swing.*; +import javax.swing.filechooser.*; +import javax.swing.border.*; +import javax.swing.SwingUtilities; +import java.net.*; +import java.util.Stack; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +public class SmartInstallerTask extends SwingWorker { + + int BUFFER = 1024; + + int architecture = 0; + String installDirectoryName = null; + JTextArea logArea = null; + boolean[] selectedPrograms = null; + + // program chooser buttons + String programChoosers[] = {"R", "R Color Brewer Package", "R HMisc Package", "Python 2.6", "S-MART"}; + + // Web addresses for the tools + String packageAddresses[][] = { + {"http://cran.cict.fr/bin/windows/base/R-2.11.0-win32.exe", "http://cran.cict.fr/bin/windows64/base/R-2.11.0-win64.exe"}, + {"", ""}, + {"", ""}, + {"http://www.python.org/ftp/python/2.6.5/python-2.6.5.msi", "http://www.python.org/ftp/python/2.6.5/python-2.6.5.amd64.msi"}, + {"http://urgi.versailles.inra.fr/content/download/1929/17848/file/s-mart-1.0.15.zip", "http://urgi.versailles.inra.fr/content/download/1929/17848/file/s-mart-1.0.15.zip"} + }; + + // Packages to install + String rPackages[] = {"RColorBrewer", "Hmisc"}; + + // Script lines + String scriptLines[][] = { + 
{"\"\\R-2.11.0-win32.exe\"", "\"\\R-2.11.0-win64.exe\""}, + {"\"\" CMD BATCH \"\\installRColorBrewer.R\"", "\"\" CMD BATCH \"\\installRColorBrewer.R\""}, + {"\"\" CMD BATCH \"\\installHmisc.R\"", "\"\" CMD BATCH \"\\installHmisc.R\""}, + {"msiexec /i \"\\python-2.6.5.msi\"", "msiexec /i \"\\python-2.6.5.amd64.msi\""}, + {"", ""} + }; + + // Files to uncompress + String compressedFiles[][] = { + {"", ""}, + {"", ""}, + {"", ""}, + {"", ""}, + {"\\s-mart-1.0.15.zip", "\\s-mart-1.0.15.zip"} + }; + + + public SmartInstallerTask(JTextArea ta, boolean[] b, String s, int a) { + logArea = ta; + selectedPrograms = b; + installDirectoryName = s; + architecture = a; + } + + + @Override + public Boolean doInBackground() { + boolean installOk; + publish("Starting install\n"); + writeFiles(); + for (int i = 0; i < selectedPrograms.length; i++) { + if (selectedPrograms[i]) { + if (! install(i)) { + return Boolean.FALSE; + } + } + } + removeFiles(); + setEnvironmentVariables(); + publish("Ending install\n"); + return Boolean.TRUE; + } + + + @Override + protected void process(List chunks) { + for (String chunk: chunks) { + logArea.append(chunk); + } + } + + + private boolean launch(String command) { + return realLaunch(new ProcessBuilder(command), command); + } + + private boolean launch(String[] command) { + return realLaunch(new ProcessBuilder(command), Arrays.toString(command)); + } + + private boolean realLaunch(ProcessBuilder pb, String command) { + BufferedReader outputReader; + pb = pb.redirectErrorStream(true); + Process process = null; + publish(" Starting command '" + command + "'\n"); + try { + process = pb.start(); + BufferedInputStream outputStream = new BufferedInputStream(process.getInputStream()); + InputStream is = process.getInputStream(); + InputStreamReader isr = new InputStreamReader(is); + outputReader = new BufferedReader(isr); + } + catch (Exception exception) { + publish(" !Process cannot be started (command is '" + command + "')!\n"); + 
exception.printStackTrace(); + return false; + } + if (outputReader == null) { + publish(" !Problem in the output of the command!\n"); + return false; + } + else { + publish(" Output is:\n"); + try { + publish(" ---\n"); + String line; + while ((line = outputReader.readLine()) != null) { + publish(" " + line + "\r\n"); + } + publish(" ---\n"); + } + catch (IOException e) { + publish(" !Cannot get the output of the command!\n"); + return false; + } + } + int exitValue = process.exitValue(); + if (exitValue != 0) { + publish(" !Problem during the execution of the command '" + command + "'!\n"); + return false; + } + publish(" Ending command '" + command + "'\n"); + return true; + } + + + private File lookForFile(String fileName, String[] putativePlaces) { + publish(" Looking for file " + fileName + "\n"); + for (String place: putativePlaces) { + File file = new File(place, fileName); + publish(" Look at " + file.getAbsolutePath() + "\n"); + if (file.exists()) { + publish(" Found it in expected place " + file.getAbsolutePath() + "\n"); + return file; + } + } + Stack files = new Stack(); + files.push(new File("\\")); + while (! 
files.empty()) { + File file = files.pop(); + for (File childFile: file.listFiles()) { + if (childFile.isDirectory()) { + files.push(childFile); + } + else { + if (fileName.compareToIgnoreCase(childFile.getName()) == 0) { + publish(" Found it in unexpected place " + childFile.getAbsolutePath() + "\n"); + return childFile; + } + } + } + } + publish(" !Cannot file file '" + fileName + "'!\n"); + return null; + } + + + private boolean writeFile(String fileName, String content) { + try { + FileWriter fw = new FileWriter(fileName); + BufferedWriter bw = new BufferedWriter(fw); + bw.write(content); + bw.close(); + fw.close(); + } + catch (Exception e) { + publish(" !Cannot write file '" + fileName + "'!\n"); + return false; + } + return true; + } + + + private boolean removeFile(String fileName) { + File file = new File(fileName); + if (file.exists()) { + if (! file.delete()) { + publish(" !Cannot delete file '" + file.getAbsolutePath() + "'!\n"); + return false; + } + } + return true; + } + + + private boolean writeFiles() { + for (String rPackage: rPackages) { + String fileName = installDirectoryName + File.separator + "install" + rPackage + ".R"; + String content = "install.packages(\"" + rPackage + "\", repos = \"http://cran.cict.fr\", dependencies = TRUE)\n"; + if (! writeFile(fileName, content)) { + publish(" !Cannot write file for R package '" + rPackage + "'!\n"); + return false; + } + } + return true; + } + + private boolean removeFiles() { + for (String rPackage: rPackages) { + File file = new File(installDirectoryName + File.separator + "install" + rPackage + ".R"); + if (! file.delete()) { + publish("!Cannot delete R install file for " + rPackage + "!\n"); + return false; + } + } + File file = new File(installDirectoryName + File.separator + "createUser.sql"); + if (! 
file.delete()) { + publish("!Cannot delete mySQL configuration file!\n"); + return false; + } + return true; + } + + private boolean install(int element) { + publish(" Starting install of " + programChoosers[element] + "\n"); + downloadPackage(element); + executeInstall(element); + uncompressPackage(element); + removePackage(element); + postProcess(element); + publish(" Ending install of " + programChoosers[element] + "\n"); + return true; + } + + + private String getLocalName(String remoteName) { + String localName = installDirectoryName + File.separator + (new File(remoteName)).getName(); + int position = localName.indexOf("?"); + if (position >= 0) { + localName = localName.substring(0, position); + } + return localName; + } + + + private boolean downloadPackage(int element) { + String fileName = packageAddresses[element][architecture]; + if (! "".equals(fileName)) { + publish(" Starting download of " + programChoosers[element] + "\n"); + try { + BufferedInputStream bis = new BufferedInputStream(new URL(fileName).openStream()); + FileOutputStream fos = new FileOutputStream(getLocalName(fileName)); + BufferedOutputStream bos = new BufferedOutputStream(fos, BUFFER); + byte[] data = new byte[BUFFER]; + int x = 0; + while((x = bis.read(data, 0, BUFFER)) >= 0) { + bos.write(data, 0, x); + } + bos.close(); + fos.close(); + bis.close(); + } + catch (IOException e) { + publish(" !Cannot download file '" + fileName + "'!\n"); + return false; + } + publish(" Ending download of " + programChoosers[element] + "\n"); + } + return true; + } + + + private String replaceSubstring(String line) { + if (line.contains("")) { + String protectedDirectory = installDirectoryName.replaceAll("\\\\", "\\\\\\\\"); + line = line.replaceAll("", protectedDirectory); + } + if (line.contains("")) { + String userName = System.getenv().get("USERNAME"); + String[] possibleRDirectories = {"C:\\Program Files\\R-2.11.0", "C:\\Documents and Settings\\" + userName + "\\Mes documents\\R\\R-2.11.0\\bin", 
"C:\\Documents and Settings\\" + userName + "\\My documents\\R\\R-2.11.0\\bin"}; + String rDirectory = lookForFile("'.exe", possibleRDirectories).getAbsolutePath(); + rDirectory = rDirectory.replaceAll("\\\\", "\\\\\\\\"); + line = line.replaceAll("", rDirectory); + } + return line; + } + + + private boolean executeInstall(int element) { + String commands = scriptLines[element][architecture]; + if (! "".equals(commands)) { + for (String command: commands.split(";")) { + command = replaceSubstring(command); + publish(" Starting command '" + command + "'\n"); + Process process = null; + try { + process = Runtime.getRuntime().exec(command); + } + catch (IOException e) { + publish(" !Cannot execute command '" + command + "'!\n"); + return false; + } + try { + process.waitFor(); + } + catch (InterruptedException e) { + publish(" !Cannot wait for the end of the command '" + command + "'!\n"); + return false; + } + int exitValue = process.exitValue(); + if (exitValue != 0) { + publish(" !Problem during the execution of the command '" + command + "'!\n"); + return false; + } + publish(" Ending command '" + command + "'\n"); + } + } + return true; + } + + + private boolean uncompressPackage(int element) { + String file = compressedFiles[element][architecture]; + if (! "".equals(file)) { + file = replaceSubstring(file); + publish(" Starting uncompressing file '" + file + "'\n"); + try { + FileInputStream fis = new FileInputStream(file); + BufferedInputStream bis = new BufferedInputStream(fis); + ZipInputStream zis = new ZipInputStream(bis); + ZipEntry entry; + while ((entry = zis.getNextEntry()) != null) { + if (! entry.isDirectory()) { + File newFile = new File(installDirectoryName + File.separator + entry.getName()); + // create parent directories + File upDirectory = newFile.getParentFile(); + while (upDirectory != null){ + if (! 
upDirectory.exists()) { + upDirectory.mkdir(); + publish(" Creating directory '" + upDirectory.getAbsolutePath() + "'\n"); + } + upDirectory = upDirectory.getParentFile(); + } + // write the files to the disk + publish(" Extracting '" + entry.getName() + "' to '" + newFile.getAbsolutePath() + "'\n"); + int count; + byte data[] = new byte[BUFFER]; + FileOutputStream fos = new FileOutputStream(newFile); + BufferedOutputStream bos = new BufferedOutputStream(fos, BUFFER); + while ((count = zis.read(data, 0, BUFFER)) != -1){ + bos.write(data, 0, count); + } + bos.flush(); + bos.close(); + fos.close(); + } + } + zis.close(); + bis.close(); + fis.close(); + } + catch(FileNotFoundException e) { + publish(" !Cannot find file '" + file + "'!\n"); + return false; + } + catch(Exception e){ + publish(" !Cannot uncompress file '" + file + "'!\n"); + return false; + } + publish(" Ending uncompressing file '" + file + "'\n"); + } + return true; + } + + + private boolean removePackage(int element) { + String packageName = packageAddresses[element][architecture]; + if ("".equals(packageName)) { + return true; + } + String fileName = getLocalName(packageAddresses[element][architecture]); + return removeFile(fileName); + } + + + private boolean postProcess(int element) { + switch (element) { + case 4: + // Move S-MART files to parent directory + File installDirectory = new File(installDirectoryName + File.separator + "S-Mart"); + for (File file: installDirectory.listFiles()) { + File destinationFile = new File(file.getParentFile().getParentFile(), file.getName()); + if (! file.renameTo(destinationFile)) { + publish(" !Cannot move '" + file.getAbsolutePath() + "' to '" + destinationFile.getAbsolutePath() + "'!\n"); + } + } + if (! 
installDirectory.delete()) { + publish(" !Cannot remove installation S-MART directory '" + installDirectory.getAbsolutePath() + "'!\n"); + } + } + return true; + } + + + private boolean setEnvironmentVariables() { + String[] command = {"REG", "ADD", "HKCU\\Environment", "/v", "PYTHONPATH", "/t", "REG_SZ", "/d", "\"" + installDirectoryName + "\\Python\"", "/f"}; + return launch(command); + } +} + diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Installer/build.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Installer/build.sh Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,5 @@ +#! /bin/sh + +rm -rf SmartInstaller.jar +javac *.java +jar cvfm SmartInstaller.jar manifest.txt *.class diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Installer/manifest.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Installer/manifest.txt Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,3 @@ +Manifest-Version: 1.0 +Created-By: Matthias Zytnicki +Main-Class: SmartInstaller diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Installer/s-mart.zip Binary file smart_toolShed/SMART/Java/Installer/s-mart.zip has changed diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/LICENSE.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/LICENSE.txt Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,506 @@ + +CeCILL FREE SOFTWARE LICENSE AGREEMENT + + + Notice + +This Agreement is a Free Software license agreement that is the result +of discussions between its authors in order to ensure compliance with +the two main principles guiding its drafting: + + * firstly, compliance with the principles governing the distribution + of Free Software: access to source code, broad rights granted to + users, + * secondly, the election of a governing law, French law, with which + it is conformant, both as regards the law of torts and + intellectual property law, and the protection that it offers 
to + both authors and holders of the economic rights over software. + +The authors of the CeCILL (for Ce[a] C[nrs] I[nria] L[ogiciel] L[ibre]) +license are: + +Commissariat à l'Energie Atomique - CEA, a public scientific, technical +and industrial research establishment, having its principal place of +business at 25 rue Leblanc, immeuble Le Ponant D, 75015 Paris, France. + +Centre National de la Recherche Scientifique - CNRS, a public scientific +and technological establishment, having its principal place of business +at 3 rue Michel-Ange, 75794 Paris cedex 16, France. + +Institut National de Recherche en Informatique et en Automatique - +INRIA, a public scientific and technological establishment, having its +principal place of business at Domaine de Voluceau, Rocquencourt, BP +105, 78153 Le Chesnay cedex, France. + + + Preamble + +The purpose of this Free Software license agreement is to grant users +the right to modify and redistribute the software governed by this +license within the framework of an open source distribution model. + +The exercising of these rights is conditional upon certain obligations +for users so as to preserve this status for all subsequent redistributions. + +In consideration of access to the source code and the rights to copy, +modify and redistribute granted by the license, users are provided only +with a limited warranty and the software's author, the holder of the +economic rights, and the successive licensors only have limited liability. + +In this respect, the risks associated with loading, using, modifying +and/or developing or reproducing the software by the user are brought to +the user's attention, given its Free Software status, which may make it +complicated to use, with the result that its use is reserved for +developers and experienced professionals having in-depth computer +knowledge. 
Users are therefore encouraged to load and test the +suitability of the software as regards their requirements in conditions +enabling the security of their systems and/or data to be ensured and, +more generally, to use and operate it in the same conditions of +security. This Agreement may be freely reproduced and published, +provided it is not altered, and that no provisions are either added or +removed herefrom. + +This Agreement may apply to any or all software for which the holder of +the economic rights decides to submit the use thereof to its provisions. + + + Article 1 - DEFINITIONS + +For the purpose of this Agreement, when the following expressions +commence with a capital letter, they shall have the following meaning: + +Agreement: means this license agreement, and its possible subsequent +versions and annexes. + +Software: means the software in its Object Code and/or Source Code form +and, where applicable, its documentation, "as is" when the Licensee +accepts the Agreement. + +Initial Software: means the Software in its Source Code and possibly its +Object Code form and, where applicable, its documentation, "as is" when +it is first distributed under the terms and conditions of the Agreement. + +Modified Software: means the Software modified by at least one +Contribution. + +Source Code: means all the Software's instructions and program lines to +which access is required so as to modify the Software. + +Object Code: means the binary files originating from the compilation of +the Source Code. + +Holder: means the holder(s) of the economic rights over the Initial +Software. + +Licensee: means the Software user(s) having accepted the Agreement. + +Contributor: means a Licensee having made at least one Contribution. + +Licensor: means the Holder, or any other individual or legal entity, who +distributes the Software under the Agreement. 
+ +Contribution: means any or all modifications, corrections, translations, +adaptations and/or new functions integrated into the Software by any or +all Contributors, as well as any or all Internal Modules. + +Module: means a set of sources files including their documentation that +enables supplementary functions or services in addition to those offered +by the Software. + +External Module: means any or all Modules, not derived from the +Software, so that this Module and the Software run in separate address +spaces, with one calling the other when they are run. + +Internal Module: means any or all Module, connected to the Software so +that they both execute in the same address space. + +GNU GPL: means the GNU General Public License version 2 or any +subsequent version, as published by the Free Software Foundation Inc. + +Parties: mean both the Licensee and the Licensor. + +These expressions may be used both in singular and plural form. + + + Article 2 - PURPOSE + +The purpose of the Agreement is the grant by the Licensor to the +Licensee of a non-exclusive, transferable and worldwide license for the +Software as set forth in Article 5 hereinafter for the whole term of the +protection granted by the rights over said Software. + + + Article 3 - ACCEPTANCE + +3.1 The Licensee shall be deemed as having accepted the terms and +conditions of this Agreement upon the occurrence of the first of the +following events: + + * (i) loading the Software by any or all means, notably, by + downloading from a remote server, or by loading from a physical + medium; + * (ii) the first time the Licensee exercises any of the rights + granted hereunder. 
+ +3.2 One copy of the Agreement, containing a notice relating to the +characteristics of the Software, to the limited warranty, and to the +fact that its use is restricted to experienced users has been provided +to the Licensee prior to its acceptance as set forth in Article 3.1 +hereinabove, and the Licensee hereby acknowledges that it has read and +understood it. + + + Article 4 - EFFECTIVE DATE AND TERM + + + 4.1 EFFECTIVE DATE + +The Agreement shall become effective on the date when it is accepted by +the Licensee as set forth in Article 3.1. + + + 4.2 TERM + +The Agreement shall remain in force for the entire legal term of +protection of the economic rights over the Software. + + + Article 5 - SCOPE OF RIGHTS GRANTED + +The Licensor hereby grants to the Licensee, who accepts, the following +rights over the Software for any or all use, and for the term of the +Agreement, on the basis of the terms and conditions set forth hereinafter. + +Besides, if the Licensor owns or comes to own one or more patents +protecting all or part of the functions of the Software or of its +components, the Licensor undertakes not to enforce the rights granted by +these patents against successive Licensees using, exploiting or +modifying the Software. If these patents are transferred, the Licensor +undertakes to have the transferees subscribe to the obligations set +forth in this paragraph. + + + 5.1 RIGHT OF USE + +The Licensee is authorized to use the Software, without any limitation +as to its fields of application, with it being hereinafter specified +that this comprises: + + 1. permanent or temporary reproduction of all or part of the Software + by any or all means and in any or all form. + + 2. loading, displaying, running, or storing the Software on any or + all medium. + + 3. entitlement to observe, study or test its operation so as to + determine the ideas and principles behind any or all constituent + elements of said Software. 
This shall apply when the Licensee + carries out any or all loading, displaying, running, transmission + or storage operation as regards the Software, that it is entitled + to carry out hereunder. + + + 5.2 ENTITLEMENT TO MAKE CONTRIBUTIONS + +The right to make Contributions includes the right to translate, adapt, +arrange, or make any or all modifications to the Software, and the right +to reproduce the resulting software. + +The Licensee is authorized to make any or all Contributions to the +Software provided that it includes an explicit notice that it is the +author of said Contribution and indicates the date of the creation thereof. + + + 5.3 RIGHT OF DISTRIBUTION + +In particular, the right of distribution includes the right to publish, +transmit and communicate the Software to the general public on any or +all medium, and by any or all means, and the right to market, either in +consideration of a fee, or free of charge, one or more copies of the +Software by any means. + +The Licensee is further authorized to distribute copies of the modified +or unmodified Software to third parties according to the terms and +conditions set forth hereinafter. + + + 5.3.1 DISTRIBUTION OF SOFTWARE WITHOUT MODIFICATION + +The Licensee is authorized to distribute true copies of the Software in +Source Code or Object Code form, provided that said distribution +complies with all the provisions of the Agreement and is accompanied by: + + 1. a copy of the Agreement, + + 2. a notice relating to the limitation of both the Licensor's + warranty and liability as set forth in Articles 8 and 9, + +and that, in the event that only the Object Code of the Software is +redistributed, the Licensee allows future Licensees unhindered access to +the full Source Code of the Software by indicating how to access it, it +being understood that the additional cost of acquiring the Source Code +shall not exceed the cost of transferring the data. 
+ + + 5.3.2 DISTRIBUTION OF MODIFIED SOFTWARE + +When the Licensee makes a Contribution to the Software, the terms and +conditions for the distribution of the resulting Modified Software +become subject to all the provisions of this Agreement. + +The Licensee is authorized to distribute the Modified Software, in +source code or object code form, provided that said distribution +complies with all the provisions of the Agreement and is accompanied by: + + 1. a copy of the Agreement, + + 2. a notice relating to the limitation of both the Licensor's + warranty and liability as set forth in Articles 8 and 9, + +and that, in the event that only the object code of the Modified +Software is redistributed, the Licensee allows future Licensees +unhindered access to the full source code of the Modified Software by +indicating how to access it, it being understood that the additional +cost of acquiring the source code shall not exceed the cost of +transferring the data. + + + 5.3.3 DISTRIBUTION OF EXTERNAL MODULES + +When the Licensee has developed an External Module, the terms and +conditions of this Agreement do not apply to said External Module, that +may be distributed under a separate license agreement. + + + 5.3.4 COMPATIBILITY WITH THE GNU GPL + +The Licensee can include a code that is subject to the provisions of one +of the versions of the GNU GPL in the Modified or unmodified Software, +and distribute that entire code under the terms of the same version of +the GNU GPL. + +The Licensee can include the Modified or unmodified Software in a code +that is subject to the provisions of one of the versions of the GNU GPL, +and distribute that entire code under the terms of the same version of +the GNU GPL. + + + Article 6 - INTELLECTUAL PROPERTY + + + 6.1 OVER THE INITIAL SOFTWARE + +The Holder owns the economic rights over the Initial Software. 
Any or +all use of the Initial Software is subject to compliance with the terms +and conditions under which the Holder has elected to distribute its work +and no one shall be entitled to modify the terms and conditions for the +distribution of said Initial Software. + +The Holder undertakes that the Initial Software will remain ruled at +least by this Agreement, for the duration set forth in Article 4.2. + + + 6.2 OVER THE CONTRIBUTIONS + +The Licensee who develops a Contribution is the owner of the +intellectual property rights over this Contribution as defined by +applicable law. + + + 6.3 OVER THE EXTERNAL MODULES + +The Licensee who develops an External Module is the owner of the +intellectual property rights over this External Module as defined by +applicable law and is free to choose the type of agreement that shall +govern its distribution. + + + 6.4 JOINT PROVISIONS + +The Licensee expressly undertakes: + + 1. not to remove, or modify, in any manner, the intellectual property + notices attached to the Software; + + 2. to reproduce said notices, in an identical manner, in the copies + of the Software modified or not. + +The Licensee undertakes not to directly or indirectly infringe the +intellectual property rights of the Holder and/or Contributors on the +Software and to take, where applicable, vis-à-vis its staff, any and all +measures required to ensure respect of said intellectual property rights +of the Holder and/or Contributors. + + + Article 7 - RELATED SERVICES + +7.1 Under no circumstances shall the Agreement oblige the Licensor to +provide technical assistance or maintenance services for the Software. + +However, the Licensor is entitled to offer this type of services. The +terms and conditions of such technical assistance, and/or such +maintenance, shall be set forth in a separate instrument. Only the +Licensor offering said maintenance and/or technical assistance services +shall incur liability therefor. 
+ +7.2 Similarly, any Licensor is entitled to offer to its licensees, under +its sole responsibility, a warranty, that shall only be binding upon +itself, for the redistribution of the Software and/or the Modified +Software, under terms and conditions that it is free to decide. Said +warranty, and the financial terms and conditions of its application, +shall be subject of a separate instrument executed between the Licensor +and the Licensee. + + + Article 8 - LIABILITY + +8.1 Subject to the provisions of Article 8.2, the Licensee shall be +entitled to claim compensation for any direct loss it may have suffered +from the Software as a result of a fault on the part of the relevant +Licensor, subject to providing evidence thereof. + +8.2 The Licensor's liability is limited to the commitments made under +this Agreement and shall not be incurred as a result of in particular: +(i) loss due the Licensee's total or partial failure to fulfill its +obligations, (ii) direct or consequential loss that is suffered by the +Licensee due to the use or performance of the Software, and (iii) more +generally, any consequential loss. In particular the Parties expressly +agree that any or all pecuniary or business loss (i.e. loss of data, +loss of profits, operating loss, loss of customers or orders, +opportunity cost, any disturbance to business activities) or any or all +legal proceedings instituted against the Licensee by a third party, +shall constitute consequential loss and shall not provide entitlement to +any or all compensation from the Licensor. + + + Article 9 - WARRANTY + +9.1 The Licensee acknowledges that the scientific and technical +state-of-the-art when the Software was distributed did not enable all +possible uses to be tested and verified, nor for the presence of +possible defects to be detected. 
In this respect, the Licensee's +attention has been drawn to the risks associated with loading, using, +modifying and/or developing and reproducing the Software which are +reserved for experienced users. + +The Licensee shall be responsible for verifying, by any or all means, +the suitability of the product for its requirements, its good working +order, and for ensuring that it shall not cause damage to either persons +or properties. + +9.2 The Licensor hereby represents, in good faith, that it is entitled +to grant all the rights over the Software (including in particular the +rights set forth in Article 5). + +9.3 The Licensee acknowledges that the Software is supplied "as is" by +the Licensor without any other express or tacit warranty, other than +that provided for in Article 9.2 and, in particular, without any warranty +as to its commercial value, its secured, safe, innovative or relevant +nature. + +Specifically, the Licensor does not warrant that the Software is free +from any error, that it will operate without interruption, that it will +be compatible with the Licensee's own equipment and software +configuration, nor that it will meet the Licensee's requirements. + +9.4 The Licensor does not either expressly or tacitly warrant that the +Software does not infringe any third party intellectual property right +relating to a patent, software or any other property right. Therefore, +the Licensor disclaims any and all liability towards the Licensee +arising out of any or all proceedings for infringement that may be +instituted in respect of the use, modification and redistribution of the +Software. Nevertheless, should such proceedings be instituted against +the Licensee, the Licensor shall provide it with technical and legal +assistance for its defense. Such technical and legal assistance shall be +decided on a case-by-case basis between the relevant Licensor and the +Licensee pursuant to a memorandum of understanding. 
The Licensor +disclaims any and all liability as regards the Licensee's use of the +name of the Software. No warranty is given as regards the existence of +prior rights over the name of the Software or as regards the existence +of a trademark. + + + Article 10 - TERMINATION + +10.1 In the event of a breach by the Licensee of its obligations +hereunder, the Licensor may automatically terminate this Agreement +thirty (30) days after notice has been sent to the Licensee and has +remained ineffective. + +10.2 A Licensee whose Agreement is terminated shall no longer be +authorized to use, modify or distribute the Software. However, any +licenses that it may have granted prior to termination of the Agreement +shall remain valid subject to their having been granted in compliance +with the terms and conditions hereof. + + + Article 11 - MISCELLANEOUS + + + 11.1 EXCUSABLE EVENTS + +Neither Party shall be liable for any or all delay, or failure to +perform the Agreement, that may be attributable to an event of force +majeure, an act of God or an outside cause, such as defective +functioning or interruptions of the electricity or telecommunications +networks, network paralysis following a virus attack, intervention by +government authorities, natural disasters, water damage, earthquakes, +fire, explosions, strikes and labor unrest, war, etc. + +11.2 Any failure by either Party, on one or more occasions, to invoke +one or more of the provisions hereof, shall under no circumstances be +interpreted as being a waiver by the interested Party of its right to +invoke said provision(s) subsequently. + +11.3 The Agreement cancels and replaces any or all previous agreements, +whether written or oral, between the Parties and having the same +purpose, and constitutes the entirety of the agreement between said +Parties concerning said purpose. 
No supplement or modification to the +terms and conditions hereof shall be effective as between the Parties +unless it is made in writing and signed by their duly authorized +representatives. + +11.4 In the event that one or more of the provisions hereof were to +conflict with a current or future applicable act or legislative text, +said act or legislative text shall prevail, and the Parties shall make +the necessary amendments so as to comply with said act or legislative +text. All other provisions shall remain effective. Similarly, invalidity +of a provision of the Agreement, for any reason whatsoever, shall not +cause the Agreement as a whole to be invalid. + + + 11.5 LANGUAGE + +The Agreement is drafted in both French and English and both versions +are deemed authentic. + + + Article 12 - NEW VERSIONS OF THE AGREEMENT + +12.1 Any person is authorized to duplicate and distribute copies of this +Agreement. + +12.2 So as to ensure coherence, the wording of this Agreement is +protected and may only be modified by the authors of the License, who +reserve the right to periodically publish updates or new versions of the +Agreement, each with a separate number. These subsequent versions may +address new issues encountered by Free Software. + +12.3 Any Software distributed under a given version of the Agreement may +only be subsequently distributed under the same version of the Agreement +or a subsequent version, subject to the provisions of Article 5.3.4. + + + Article 13 - GOVERNING LAW AND JURISDICTION + +13.1 The Agreement is governed by French law. The Parties agree to +endeavor to seek an amicable solution to any disagreements or disputes +that may arise during the performance of the Agreement. + +13.2 Failing an amicable solution within two (2) months as from their +occurrence, and unless emergency proceedings are necessary, the +disagreements or disputes shall be referred to the Paris Courts having +jurisdiction, by the more diligent Party. 
+ + +Version 2.0 dated 2006-09-05. diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Program.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Program.java Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,175 @@ +/** + * + * Copyright INRA-URGI 2009-2010 + * + * This software is governed by the CeCILL license under French law and + * abiding by the rules of distribution of free software. You can use, + * modify and/ or redistribute the software under the terms of the CeCILL + * license as circulated by CEA, CNRS and INRIA at the following URL + * "http://www.cecill.info". + * + * As a counterpart to the access to the source code and rights to copy, + * modify and redistribute granted by the license, users are provided only + * with a limited warranty and the software's author, the holder of the + * economic rights, and the successive licensors have only limited + * liability. + * + * In this respect, the user's attention is drawn to the risks associated + * with loading, using, modifying and/or developing or reproducing the + * software by the user in light of its specific status of free software, + * that may mean that it is complicated to manipulate, and that also + * therefore means that it is reserved for developers and experienced + * professionals having in-depth computer knowledge. Users are therefore + * encouraged to load and test the software's suitability as regards their + * requirements in conditions enabling the security of their systems and/or + * data to be ensured and, more generally, to use and operate it in the + * same conditions as regards security. + * + * The fact that you are presently reading this means that you have had + * knowledge of the CeCILL license and that you accept its terms. 
+ * + */ +import java.util.*; +import java.awt.*; +import javax.swing.*; + + +public class Program { + String shortName; + String name; + String section; + String description; + Vector options; + JPanel panel; + JButton button; + + + public Program() { + this.shortName = null; + this.name = null; + this.options = new Vector (); + } + + + public void setShortName(String shortName) { + this.shortName = shortName; + } + + + public void setName(String name) { + this.name = name; + } + + + public void setSection(String section) { + this.section = section; + } + + public void setDescription(String description) { + this.description = description; + } + + + public void addOption(ProgramOption option) { + options.add(option); + } + + + public String getShortName() { + return this.shortName; + } + + + public String getName() { + return this.name; + } + + + public String getSection() { + return this.section; + } + + public String getDescription() { + return this.description; + } + + + public String checkValues() { + for (int i = 0; i < options.size(); i++) { + String comment = options.get(i).checkValue(); + if (comment != null) { + return comment; + } + } + return null; + } + + + public LinkedList getCommand() { + LinkedList parameterList = new LinkedList(); + parameterList.add(Global.pythonCommand); + parameterList.add("Python" + java.io.File.separator + this.shortName); + for (int i = 0; i < options.size(); i++) { + ProgramOption option = options.get(i); + parameterList.addAll(option.getCommand()); + } + return parameterList; + } + + + public JPanel getPanel() { + if (this.panel != null) { + return this.panel; + } + + this.panel = new JPanel(false); + this.panel.setLayout(new FlowLayout()); + Box box = Box.createVerticalBox(); + + JPanel descriptionPanel = new JPanel(false); + JLabel descriptionLabel = new JLabel(this.description); + descriptionPanel.add(descriptionLabel); + box.add(descriptionPanel); + + for (int i = 0; i < options.size(); i++) { + ProgramOption option = 
options.get(i); + JPanel panel = option.getPanel(); + if (panel == null) { + System.out.println("Problem with Python program '" + this.shortName + "'."); + return null; + } + box.add(option.getPanel()); + } + + JPanel buttonPanel = new JPanel(false); + this.button = new JButton("GO!"); + + buttonPanel.add(button); + + box.add(buttonPanel); + + this.panel.add(box); + + return this.panel; + } + + + public JButton getButton() { + if (this.button == null) { + this.getPanel(); + } + return this.button; + } + + + public Vector < File > getOutputFiles() { + Vector < File > files = new Vector < File > (); + for (int i = 0; i < options.size(); i++) { + ProgramOption option = options.get(i); + if (! option.isInput()) { + files.add(option.getOutputFile()); + } + } + return files; + } +} diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/ProgramFileReader.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/ProgramFileReader.java Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,174 @@ +/** + * + * Copyright INRA-URGI 2009-2010 + * + * This software is governed by the CeCILL license under French law and + * abiding by the rules of distribution of free software. You can use, + * modify and/ or redistribute the software under the terms of the CeCILL + * license as circulated by CEA, CNRS and INRIA at the following URL + * "http://www.cecill.info". + * + * As a counterpart to the access to the source code and rights to copy, + * modify and redistribute granted by the license, users are provided only + * with a limited warranty and the software's author, the holder of the + * economic rights, and the successive licensors have only limited + * liability. 
+ * + * In this respect, the user's attention is drawn to the risks associated + * with loading, using, modifying and/or developing or reproducing the + * software by the user in light of its specific status of free software, + * that may mean that it is complicated to manipulate, and that also + * therefore means that it is reserved for developers and experienced + * professionals having in-depth computer knowledge. Users are therefore + * encouraged to load and test the software's suitability as regards their + * requirements in conditions enabling the security of their systems and/or + * data to be ensured and, more generally, to use and operate it in the + * same conditions as regards security. + * + * The fact that you are presently reading this means that you have had + * knowledge of the CeCILL license and that you accept its terms. + * + */ +import java.util.*; +import java.io.File; +import java.io.*; + + +public class ProgramFileReader { + String fileName; + Vector programs; + + + public ProgramFileReader(String fileName) { + this.fileName = fileName; + this.programs = new Vector (); + } + + + public boolean read() { +// File file = new File(this.fileName); +// Program program = null; +// int step = 0; +// TreeMap options = new TreeMap (); + +// try { +// BufferedReader reader = new BufferedReader(new FileReader(file)); +// String line = null; +// String section = null; + +// while ((line = reader.readLine()) != null) { + +// line = line.trim(); + +// if (line.length() == 0) { +// if (program != null) { +// programs.add(program); +// } +// program = null; +// step = 0; +// continue; +// } + +// if ((line.charAt(0) == '[') && (line.charAt(line.length() - 1) == ']')) { +// section = line.substring(1, line.length() - 1).trim(); +// continue; +// } +// switch (step) { +// case 0: +// program = new Program(); +// program.setName(line); +// if (section == null) { +// System.out.println("Error! 
Section of program '" + line + "' is not set!"); +// } +// program.setSection(section); +// step = 1; +// break; +// case 1: +// program.setShortName(line); +// step = 2; +// break; +// case 2: +// ProgramOption option = new ProgramOption(); + +// String[] elements = line.split(":"); +// boolean input = elements[0].trim().equalsIgnoreCase("input")? true: false; +// String[] subElements = elements[1].split(";"); +// String identifier = subElements[0].trim(); + +// option.setInput(input); + +// if (input) { + +// if (subElements.length < 4) { +// System.out.println("Line '" + line + "' is weird..."); +// } + +// String type = subElements[1].trim(); +// String comment = subElements[2].trim(); +// boolean compulsory = subElements[3].trim().equalsIgnoreCase("0")? false: true; + +// option.setIdentifier(identifier); +// option.setType(type); +// option.setComment(comment); +// option.setCompulsory(compulsory); + +// if ("file".compareToIgnoreCase(type) == 0) { +// if (subElements.length < 5) { +// System.out.println("Line '" + line + "' is weird..."); +// } + +// String formatIdentifier = subElements[4].trim(); +// option.setFormatIdentifier(formatIdentifier); +// } +// else if ("choice".compareToIgnoreCase(type) == 0) { +// if (subElements.length < 5) { +// System.out.println("Line '" + line + "' is weird..."); +// } + +// String[] choices = subElements[4].trim().split(","); +// for (int i = 0; i < choices.length; i++) { +// choices[i] = choices[i].trim(); +// } +// option.setChoices(choices); +// } +// options.put(identifier, option); +// } +// else { +// String format = subElements[1].trim(); + +// option.setFormat(format); +// option.setAssociatedOption(options.get(identifier)); +// } + +// program.addOption(option); + +// break; +// default: +// return false; +// } +// } + +// reader.close(); +// } +// catch (FileNotFoundException e) { +// return false; +// } +// catch (IOException e) { +// return false; +// } + +// if (program != null) { +// programs.add(program); 
+// } + + return true; + } + + public int getNbPrograms() { + return programs.size(); + } + + public Program getProgram(int i) { + return programs.get(i); + } +} diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/ProgramLauncher.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/ProgramLauncher.java Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,209 @@ +/** + * + * Copyright INRA-URGI 2009-2010 + * + * This software is governed by the CeCILL license under French law and + * abiding by the rules of distribution of free software. You can use, + * modify and/ or redistribute the software under the terms of the CeCILL + * license as circulated by CEA, CNRS and INRIA at the following URL + * "http://www.cecill.info". + * + * As a counterpart to the access to the source code and rights to copy, + * modify and redistribute granted by the license, users are provided only + * with a limited warranty and the software's author, the holder of the + * economic rights, and the successive licensors have only limited + * liability. + * + * In this respect, the user's attention is drawn to the risks associated + * with loading, using, modifying and/or developing or reproducing the + * software by the user in light of its specific status of free software, + * that may mean that it is complicated to manipulate, and that also + * therefore means that it is reserved for developers and experienced + * professionals having in-depth computer knowledge. Users are therefore + * encouraged to load and test the software's suitability as regards their + * requirements in conditions enabling the security of their systems and/or + * data to be ensured and, more generally, to use and operate it in the + * same conditions as regards security. + * + * The fact that you are presently reading this means that you have had + * knowledge of the CeCILL license and that you accept its terms. 
+ * + */ +import java.util.*; +import java.io.*; +import javax.swing.SwingUtilities; +import javax.swing.*; +import java.util.concurrent.CountDownLatch; + +public class ProgramLauncher extends SwingWorker { + + String[] command; + JTextArea logArea; + JLabel messageField; + JProgressBar progressBar; + JLabel etaField; + int exitValue; + CountDownLatch latch; + + + + public ProgramLauncher (LinkedList c, JTextArea la, JLabel mf, JProgressBar pb, JLabel ef) { + command = new String[c.size()]; + logArea = la; + messageField = mf; + progressBar = pb; + etaField = ef; + exitValue = -1; + c.toArray(command); + latch = new CountDownLatch(1); + } + + + public ProgramLauncher (String[] c, JTextArea la, JLabel mf, JProgressBar pb, JLabel ef) { + command = c; + logArea = la; + messageField = mf; + progressBar = pb; + etaField = ef; + exitValue = -1; + latch = new CountDownLatch(1); + } + + + @Override + public Boolean doInBackground() { + ProcessBuilder pb = new ProcessBuilder(command); + Process process = null; + BufferedReader outputReader = null; + pb = pb.redirectErrorStream(true); + Map env = pb.environment(); + env.put("PYTHONPATH", System.getProperty("user.dir")); + env.put("SMARTPATH", System.getProperty("user.dir") + java.io.File.separator + "SMART" + java.io.File.separator + "Java" + java.io.File.separator + "Python"); + env.put("SMARTMYSQLPATH", Global.mysqlCommand); + env.put("SMARTRPATH", Global.rCommand); + String commandJoined = Arrays.toString(command); + + try { + publish("=== Starting command '" + commandJoined.trim() + "' ===\n"); + process = pb.start(); + + BufferedInputStream outputStream = new BufferedInputStream(process.getInputStream()); + InputStream is = process.getInputStream(); + InputStreamReader isr = new InputStreamReader(is); + outputReader = new BufferedReader(isr); + } + catch (Exception exception) { + publish("!Process cannot be started (command is '" + commandJoined + "')!\n"); + exception.printStackTrace(); + latch.countDown(); + return 
Boolean.FALSE; + } + if (outputReader == null) { + publish("!Problem in the output of the command!\n"); + latch.countDown(); + return Boolean.FALSE; + } + else { + try { + String line; + while ((line = outputReader.readLine()) != null) { + publish(line + "\n"); + } + } + catch (IOException e) { + e.printStackTrace(); + publish("!Cannot get the output of the command!\n"); + latch.countDown(); + return Boolean.FALSE; + } + } + try { + process.waitFor(); + } + catch (InterruptedException e) { + e.printStackTrace(); + publish("!Cannot wait for the end of the command!\n"); + latch.countDown(); + return Boolean.FALSE; + } + try { + exitValue = process.exitValue(); + } + catch (IllegalThreadStateException e) { + e.printStackTrace(); + publish("!Cannot get the exit value of the command!\n"); + latch.countDown(); + return Boolean.FALSE; + } + if (exitValue != 0) { + publish("!Problem during the execution of the command '" + commandJoined + "'!\n"); + latch.countDown(); + return Boolean.FALSE; + } + publish("=== Ending command '" + commandJoined.trim() + "' ===\n"); + latch.countDown(); + return Boolean.TRUE; + } + + + @Override + protected void process(List chunks) { + String message = ""; + String text = logArea.getText(); + for (String chunk: chunks) { + text += chunk; + } + for (String lineSeparatedByCarriageReturn: text.split("\n")) { + for (String line: lineSeparatedByCarriageReturn.split("\r")) { + boolean progressLine = false; + if (line.matches(".*\\[=*\\s*\\]\\s*\\d*/\\d*\\s*")) { + String[] ratioElements = line.split("\\]")[1].trim().split("/"); + int current = Integer.parseInt(ratioElements[0].trim()); + int aim = Integer.parseInt(ratioElements[1].trim()); + messageField.setText(line.split("\\[")[0].trim()); + progressBar.setValue(current * 100 / aim); + etaField.setText(""); + progressLine = true; + } + else if (line.matches(".*\\[=*\\s*\\]\\s*\\d*/\\d*\\s*ETA:\\s*.*")) { + String[] ratioElements = line.split("\\]")[1].split("E")[0].trim().split("/"); + int 
current = Integer.parseInt(ratioElements[0].trim()); + int aim = Integer.parseInt(ratioElements[1].trim()); + String eta = line.split("ETA:")[1].trim(); + messageField.setText(line.split("\\[")[0].trim()); + progressBar.setValue(current * 100 / aim); + etaField.setText("ETA: " + eta); + progressLine = true; + } + else if (line.matches(".*\\[=*\\s*\\]\\s*\\d*\\s*completed in.*")) { + String nbElements = line.split("\\]")[1].split("completed")[0].trim(); + String timeSpent = line.split("completed in")[1].trim(); + message += line.split("\\[")[0].trim() + ": " + nbElements + " elements completed in " + timeSpent + "\n"; + messageField.setText(line.split("\\[")[0].trim()); + progressLine = true; + } + if (! progressLine) { + message += line + "\n"; + } + } + } + String lines[] = message.split("\n"); + String toBeWritten = ""; + for (int i = Math.max(0, lines.length - Global.logAreaSize); i < lines.length; i++) { + toBeWritten += lines[i] + "\n"; + } + logArea.setText(toBeWritten); + } + + public int getExitValue() { + try { + latch.await(); + } + catch (InterruptedException e) { + logArea.append("Cannot wait for the end of the process!\n"); + e.printStackTrace(); + return -1; + } + return exitValue; + } +} diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/ProgramOption.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/ProgramOption.java Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,358 @@ +/** + * + * Copyright INRA-URGI 2009-2010 + * + * This software is governed by the CeCILL license under French law and + * abiding by the rules of distribution of free software. You can use, + * modify and/ or redistribute the software under the terms of the CeCILL + * license as circulated by CEA, CNRS and INRIA at the following URL + * "http://www.cecill.info". 
+ * + * As a counterpart to the access to the source code and rights to copy, + * modify and redistribute granted by the license, users are provided only + * with a limited warranty and the software's author, the holder of the + * economic rights, and the successive licensors have only limited + * liability. + * + * In this respect, the user's attention is drawn to the risks associated + * with loading, using, modifying and/or developing or reproducing the + * software by the user in light of its specific status of free software, + * that may mean that it is complicated to manipulate, and that also + * therefore means that it is reserved for developers and experienced + * professionals having in-depth computer knowledge. Users are therefore + * encouraged to load and test the software's suitability as regards their + * requirements in conditions enabling the security of their systems and/or + * data to be ensured and, more generally, to use and operate it in the + * same conditions as regards security. + * + * The fact that you are presently reading this means that you have had + * knowledge of the CeCILL license and that you accept its terms. 
+ * + */ +import java.util.*; +import java.awt.*; +import java.awt.event.ActionEvent; +import java.awt.event.ActionListener; +import java.io.*; +import javax.swing.*; +import javax.swing.filechooser.*; +import javax.swing.border.*; +import javax.swing.SwingUtilities; + + +public class ProgramOption { + boolean input; + String identifier; + String type; + String comment; + boolean compulsory; + String[] format; + String formatIdentifier; + ProgramOption associatedOption; + String defaultValue; + String[] choices; + JComponent component; + JPanel panel; + + + public ProgramOption() { + this.input = true; + this.identifier = null; + this.type = null; + this.comment = null; + this.compulsory = false; + this.format = null; + this.formatIdentifier = null; + this.associatedOption = null; + this.defaultValue = ""; + this.choices = null; + this.component = null; + this.panel = null; + } + + + public void setInput(boolean input) { + this.input = input; + } + + + public void setIdentifier(String identifier) { + this.identifier = identifier; + } + + + public void setType(String type) { + this.type = type; + } + + + public void setComment(String comment) { + this.comment = comment; + } + + + public void setCompulsory(boolean compulsory) { + this.compulsory = compulsory; + } + + + public void setFormat(String[] format) { + this.format = format; + } + + + public void setFormat(String format) { + this.format = new String[1]; + this.format[0] = format; + } + + + public void setFormatIdentifier(String formatIdentifier) { + this.formatIdentifier = formatIdentifier; + } + + + public void setAssociatedOption(ProgramOption option) { + this.associatedOption = option; + } + + + public void setChoices(String[] choices) { + this.choices = new String[choices.length+1]; + this.choices[0] = "---"; + for (int i = 0; i < choices.length; i++) { + this.choices[i+1] = choices[i]; + } + } + + + public void setDefault(String defaultValue) { + this.defaultValue = defaultValue; + } + + + public boolean 
isInput() { + return this.input; + } + + + public boolean checkSettings() { + if (this.identifier == null) { + return false; + } + if (this.type == null) { + return false; + } + if (this.comment == null) { + return false; + } + if (this.comment == null) { + return false; + } + if (("choice".compareToIgnoreCase(this.type) == 0) && (this.choices == null)) { + return false; + } + return true; + } + + + public JPanel getPanel() { + if (this.panel != null) { + return this.panel; + } + String comment = this.comment; + if (this.compulsory) { + comment += " [*]"; + } + + GridLayout horizontalLayout = new GridLayout(1, 0); + this.panel = new JPanel(false); + this.panel.setLayout(horizontalLayout); + JLabel label = new JLabel(comment); + + if (this.type == null) { + System.out.println("Error! Option '" + this.identifier + "' is not set!"); + } + + if (("int".compareToIgnoreCase(this.type) == 0) || ("float".compareToIgnoreCase(this.type) == 0) || ("string".compareToIgnoreCase(this.type) == 0) || (("file".compareToIgnoreCase(this.type) == 0) && (!this.input))) { + this.component = new JTextField(); + if (this.defaultValue != null) { + ((JTextField) this.component).setText(this.defaultValue); + } + label.setLabelFor(this.component); + this.panel.add(label); + this.panel.add(this.component); + } + else if ("file".compareToIgnoreCase(this.type) == 0) { + this.component = new JComboBox(Global.fileNames); + label.setLabelFor(this.component); + this.panel.add(label); + this.panel.add(this.component); + } + else if ("boolean".compareToIgnoreCase(this.type) == 0) { + this.component = new JCheckBox(); + if ((this.defaultValue != null) && (this.defaultValue.compareToIgnoreCase("true") == 0)) { + ((JCheckBox) this.component).setSelected(true); + } + label.setLabelFor(this.component); + this.panel.add(label); + this.panel.add(this.component); + } + else if ("format".compareToIgnoreCase(this.type) == 0) { + Vector < String > formats = new Vector < String > (); + for (String format: 
this.format) { + if (Global.formats.getFormats(format) == null) { + System.out.println("Do not know how to handle format '" + format + "'."); + } + formats.addAll(Global.formats.getFormats(format).getFormats()); + } + this.component = new JComboBox(formats); + label.setLabelFor(this.component); + this.panel.add(label); + this.panel.add(this.component); + } + else if ("files".compareToIgnoreCase(this.type) == 0) { + JButton button = new JButton("file..."); + this.component = new JTextField(); + label.setLabelFor(this.component); + this.panel.add(label); + this.panel.add(this.component); + this.panel.add(button); + Global.otherFileConcatenationChooser.put(button, (JTextField) this.component); + } + else if ("directory".compareToIgnoreCase(this.type) == 0) { + JButton button = new JButton("directory..."); + this.component = new JTextField(); + label.setLabelFor(this.component); + this.panel.add(label); + JPanel rightPanel = new JPanel(false); + rightPanel.setLayout(new BoxLayout(rightPanel, BoxLayout.LINE_AXIS)); + rightPanel.add(this.component); + rightPanel.add(button); + this.panel.add(rightPanel); + Global.otherDirectoriesChooser.put(button, (JTextField) this.component); + } + else if ("choice".compareToIgnoreCase(this.type) == 0) { + this.component = new JComboBox(this.choices); + label.setLabelFor(this.component); + this.panel.add(label); + this.panel.add(this.component); + } + else { + System.out.println("Do not know how to read type " + this.type); + } + + return this.panel; + } + + + public JComponent getComponent() { + if (component == null) { + this.getPanel(); + } + return this.component; + } + + + private String getValue() { + if (("int".equals(this.type)) || ("float".equals(this.type)) || ("string".equals(this.type)) || (("file".equals(this.type)) && (! 
this.input)) || ("directory".equals(this.type)) || ("files".equals(this.type))) { + String s = ((JTextField) this.component).getText(); + if ("None".equals(s)) { + return ""; + } + return s; + } + if ("file".equals(this.type)) { + return (String) ((JComboBox) this.component).getSelectedItem(); + } + if ("boolean".equals(this.type)) { + return ((JCheckBox) this.component).isSelected()? "true": "false"; + } + if ("format".equals(this.type)) { + return (String) ((JComboBox) this.component).getSelectedItem(); + } + if ("choice".equals(this.type)) { + String s = (String) ((JComboBox) this.component).getSelectedItem(); + if ("---".equals(s)) { + return ""; + } + return s; + } + System.out.println("Do not know how to get value of '" + this.type + "' (" + this.identifier + ")."); + return null; + } + + + public String checkValue() { + String value = this.getValue(); + if ((this.compulsory) && ((value == null) || ("".equals(value)))) { + return "Option '" + this.comment + "' has no value... Please specify it.\n"; + } + if ("int".equals(this.type)) { + if ((value != null) && (! "".equals(value)) && (! "None".equals(value))) { + try { + int i = Integer.parseInt(value); + } + catch (NumberFormatException e) { + return "Option '" + this.comment + "' should be an integer... Please correct it.\n"; + } + } + } + else if ("float".equals(this.type)) { + if ((value != null) && (! "".equals(value))) { + try { + float i = Float.parseFloat(value); + } + catch (NumberFormatException e) { + return "Option '" + this.comment + "' should be a float... Please correct it.\n"; + } + } + } + return null; + } + + + public LinkedList getCommand() { + LinkedList list = new LinkedList (); + + if (("int".equals(this.type)) || ("float".equals(this.type)) || ("string".equals(this.type)) || (("file".equals(this.type)) && (! 
this.input)) || ("format".equals(this.type)) || ("directory".equals(this.type)) || ("files".equals(this.type)) || ("choice".equals(this.type))) { + String value = this.getValue(); + if (value.length() == 0) { + return list; + } + list.add(this.identifier); + list.add(value); + return list; + } + if ("file".equals(this.type)) { + String fileName = (String) ((JComboBox) this.component).getSelectedItem(); + if (fileName == null) { + return list; + } + list.add(this.identifier); + list.add(this.getValue()); + return list; + } + if (("boolean".equals(this.type)) || ("bool".equals(this.type))) { + if ("true".equals(this.getValue())) { + list.add(this.identifier); + } + return list; + } + System.out.println("Cannot get type of option " + this.type + " (" + this.identifier + "): " + this.getValue()); + return null; + } + + + public File getOutputFile() { + if (this.input) return null; + String format = ""; + if (this.format != null) { + format = this.format[0]; + } + if (this.associatedOption != null) { + format = this.associatedOption.getValue(); + } + return new File(this.getValue(), Global.formats.getFormatType(format), format); + } +} diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/.gitignore --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/.gitignore Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,1 @@ +/CleanTranscriptFile.py diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/CleanTranscriptFile.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/CleanTranscriptFile.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,74 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2011 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. 
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
from optparse import OptionParser
from SMART.Java.Python.cleaning.CleanerChooser import CleanerChooser


class CleanTranscriptFile(object):
    """
    Clean a transcript file with the cleaner which the CleanerChooser associates to its format
    """

    def __init__(self, verbosity):
        self.verbosity = verbosity
        self.chooser   = CleanerChooser(self.verbosity)
        # set by setInputFile, which has to be called first
        self.cleaner   = None

    def _getCleaner(self):
        # explicit message instead of an obscure AttributeError
        if self.cleaner is None:
            raise Exception("Input file should be set before any other parameter.")
        return self.cleaner

    def setInputFile(self, fileName, format):
        """
        Select the cleaner from the format, and give it the input file
        """
        self.chooser.findFormat(format)
        self.cleaner = self.chooser.getCleaner()
        self.cleaner.setInputFileName(fileName)

    def setOutputFile(self, fileName):
        self._getCleaner().setOutputFileName(fileName)

    def setAcceptedTypes(self, types):
        """
        Restrict the types which are kept (no restriction if types is None)
        """
        if types is not None:
            self._getCleaner().setAcceptedTypes(types)

    def run(self):
        self._getCleaner().clean()


if __name__ == "__main__":

    description = "Clean Transcript File v1.0.1: Clean a transcript file so that it is useable for S-MART. [Category: Other]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-t", "--types", dest="acceptedTypes", action="store", default=None, type="string", help="name of the types you want to keep in GFF/GTF (list separated by commas) [format: string] [default: None]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # stop with the usage message rather than fail later on a None file name or format
    if options.inputFileName is None or options.format is None:
        parser.error("options -i and -f are compulsory")

    ctf = CleanTranscriptFile(options.verbosity)
    ctf.setInputFile(options.inputFileName, options.format)
    ctf.setOutputFile(options.outputFileName)
    ctf.setAcceptedTypes(None if options.acceptedTypes is None else options.acceptedTypes.split(","))
    ctf.run()
diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ClusterizeByTags.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/ClusterizeByTags.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,157 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2011 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import random
from optparse import OptionParser
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.writer.TranscriptWriter import TranscriptWriter
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection
from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter


OPERATIONS    = ("diff", "div")
BOOLTOSTRANDS = {True: [0], False: [-1, 1]}

class ClusterizeByTags(object):
    """
    Merge consecutive elements (sorted by start, end) while the difference -- or the ratio --
    between the values of a tag of two consecutive elements stays stable, within a threshold.
    The tag of a merged element is the sum of the tags of its elements.
    """

    def __init__(self, verbosity):
        self.verbosity    = verbosity
        self.connection   = MySqlConnection(self.verbosity-1)
        self.tagName      = None
        self.defaultValue = None
        self.threshold    = None
        self.operation    = None
        self.maxDistance  = None
        self.oneStrand    = False

    def setInputFile(self, fileName, format):
        """
        Load the input file into MySQL tables (one entry per chromosome in transcriptTables)
        """
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        parser = chooser.getParser(fileName)
        writer = MySqlTranscriptWriter(self.connection, None, self.verbosity)
        writer.addTranscriptList(parser)
        writer.write()
        self.transcriptTables = writer.getTables()

    def setOutputFile(self, fileName):
        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)

    def setTag(self, tagName, defaultValue):
        """
        Set the name of the tag, and the value used for the elements which miss it
        """
        self.tagName      = tagName
        self.defaultValue = defaultValue

    def setThreshold(self, threshold):
        self.threshold = threshold

    def setOperation(self, operation):
        if operation not in OPERATIONS:
            raise Exception("Operation '%s' unsupported: choose among %s" % (operation, ", ".join(OPERATIONS)))
        self.operation = operation

    def setMaxDistance(self, distance):
        self.maxDistance = distance

    def setOneStrand(self, oneStrand):
        """
        If oneStrand is True, elements on different strands can be merged
        """
        self.oneStrand = oneStrand

    def _getValue(self, transcript):
        # value of the tag, or the default value; fail with a clear message if there is none
        if self.tagName in transcript.getTagNames():
            return transcript.getTagValue(self.tagName)
        if self.defaultValue is None:
            raise Exception("Transcript %s misses tag %s, and has no default value!" % (transcript, self.tagName))
        return self.defaultValue

    def _getTrend(self, previousValue, value):
        # NOTE(review): "div" fails if the previous value is 0, and truncates integers with Python 2 -- confirm the expected behavior
        if previousValue is None:
            return None
        if self.operation == "diff":
            return value - previousValue
        return value / previousValue

    def _canMerge(self, previousTranscript, transcript, previousTrend, trend):
        if previousTrend is not None and abs(trend - previousTrend) > self.threshold:
            return False
        if self.maxDistance is not None and previousTranscript.getDistance(transcript) > self.maxDistance:
            return False
        # without oneStrand, the query already selects one strand at a time
        return self.oneStrand or previousTranscript.getDirection() == transcript.getDirection()

    def _flush(self, transcript, sumValue):
        transcript.setTagValue(self.tagName, sumValue)
        self.writer.addTranscript(transcript)

    def run(self):
        for chromosome in sorted(self.transcriptTables.keys()):
            table    = self.transcriptTables[chromosome]
            progress = Progress(table.getNbElements(), "Analyzing %s" % (chromosome), self.verbosity)
            for strand in BOOLTOSTRANDS[self.oneStrand]:
                previousValue      = None
                previousTrend      = None
                previousTranscript = None
                sumValue           = 0
                command = "SELECT * FROM %s" % (table.getName())
                if not self.oneStrand:
                    command += " WHERE direction = %d" % (strand)
                command += " ORDER BY start, end"
                for index, transcript in table.selectTranscripts(command):
                    value = self._getValue(transcript)
                    trend = self._getTrend(previousValue, value)
                    if previousTranscript is None:
                        sumValue = value
                    elif self._canMerge(previousTranscript, transcript, previousTrend, trend):
                        if previousTranscript.getDirection() != transcript.getDirection():
                            transcript.reverse()
                        previousTranscript.merge(transcript)
                        transcript    = previousTranscript
                        sumValue     += value
                        previousTrend = trend
                    else:
                        self._flush(previousTranscript, sumValue)
                        sumValue      = value
                        previousTrend = None
                    previousValue      = value
                    previousTranscript = transcript
                    progress.inc()
                if previousTranscript is not None:
                    self._flush(previousTranscript, sumValue)
            progress.done()
        self.writer.close()


if __name__ == "__main__":

    description = "Clusterize By Tags v1.0.1: Clusterize a set of element using their tag values. [Category: Merge]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-t", "--tag", dest="tagName", action="store", type="string", help="name of the tag [format: string] [compulsory]")
    parser.add_option("-e", "--default", dest="defaultValue", action="store", default=None, type="int", help="default value for the tag [format: string]")
    parser.add_option("-r", "--threshold", dest="threshold", action="store", type="int", help="threshold between two consecutive tags [format: int] [compulsory]")
    parser.add_option("-p", "--operation", dest="operation", action="store", type="string", help="operation to apply between 2 different clusters to compare them [format: choice (diff, div)] [compulsory]")
    parser.add_option("-d", "--distance", dest="maxDistance", action="store", default=None, type="int", help="maximum distance for 2 clusters to be merged [format: int] [default: None]")
    parser.add_option("-1", "--oneStrand", dest="oneStrand", action="store_true", default=False, help="also cluster the elements which are on different strands [format: bool] [default: False]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # stop with the usage message if an option flagged as compulsory is missing
    for flag, value in (("-i", options.inputFileName), ("-f", options.format), ("-t", options.tagName), ("-r", options.threshold), ("-p", options.operation)):
        if value is None:
            parser.error("option %s is compulsory" % (flag))

    cbt = ClusterizeByTags(options.verbosity)
    cbt.setInputFile(options.inputFileName, options.format)
    cbt.setOutputFile(options.outputFileName)
    cbt.setTag(options.tagName, options.defaultValue)
    cbt.setThreshold(options.threshold)
    cbt.setOperation(options.operation)
    cbt.setMaxDistance(options.maxDistance)
    cbt.setOneStrand(options.oneStrand)
    cbt.run()
#
import os
from optparse import OptionParser, OptionGroup
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.writer.Gff3Writer import Gff3Writer
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle
from SMART.Java.Python.ncList.FileSorter import FileSorter
from SMART.Java.Python.misc.Progress import Progress


class CollapseReads(object):
    """
    Merge two reads if they have exactly the same genomic coordinates
    """

    def __init__(self, verbosity = 0):
        self.verbosity         = verbosity
        self.inputReader       = None
        self.outputWriter      = None
        self.strands           = True
        self.nbRead            = 0
        self.nbWritten         = 0
        self.nbMerges          = 0
        self.splittedFileNames = {}

    def __del__(self):
        # remove the temporary per-chromosome files; a file may already be gone
        for fileName in self.splittedFileNames.values():
            if os.path.exists(fileName):
                os.remove(fileName)

    def close(self):
        """
        Close the output writer (can be called several times)
        """
        if self.outputWriter is not None:
            self.outputWriter.close()
            self.outputWriter = None

    def setInputFile(self, fileName, format):
        parserChooser = ParserChooser(self.verbosity)
        parserChooser.findFormat(format, "transcript")
        self.parser         = parserChooser.getParser(fileName)
        self.sortedFileName = "%s_sorted.pkl" % (os.path.splitext(fileName)[0])

    def setOutputFile(self, fileName):
        self.outputWriter = Gff3Writer(fileName, self.verbosity)

    def getNbElements(self):
        return self.parser.getNbTranscripts()

    def _sortFile(self):
        # sort the input into one file per chromosome
        fs = FileSorter(self.parser, self.verbosity-4)
        fs.perChromosome(True)
        fs.setOutputFileName(self.sortedFileName)
        fs.sort()
        self.splittedFileNames       = fs.getOutputFileNames()
        self.nbElementsPerChromosome = fs.getNbElementsPerChromosome()
        self.nbRead                  = fs.getNbElements()

    def _iterate(self, chromosome):
        """
        Collapse the reads of one chromosome: the reads which may still be merged are kept
        in memory, and written as soon as a read starts after them
        """
        progress    = Progress(self.nbElementsPerChromosome[chromosome], "Checking chromosome %s" % (chromosome), self.verbosity)
        transcripts = []
        parser      = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity)
        for newTranscript in parser.getIterator():
            newTranscripts = []
            for oldTranscript in transcripts:
                if self._checkOverlap(newTranscript, oldTranscript):
                    self._merge(newTranscript, oldTranscript)
                elif self._checkPassed(newTranscript, oldTranscript):
                    self._write(oldTranscript)
                else:
                    newTranscripts.append(oldTranscript)
            newTranscripts.append(newTranscript)
            transcripts = newTranscripts
            progress.inc()
        for transcript in transcripts:
            self._write(transcript)
        progress.done()

    def _merge(self, transcript1, transcript2):
        # merge the second transcript into the first one
        self.nbMerges += 1
        transcript2.setDirection(transcript1.getDirection())
        transcript1.merge(transcript2)

    def _write(self, transcript):
        self.nbWritten += 1
        self.outputWriter.addTranscript(transcript)

    def _checkOverlap(self, transcript1, transcript2):
        # same start and end, and same direction if the strands are considered
        if transcript1.getStart() != transcript2.getStart() or transcript1.getEnd() != transcript2.getEnd():
            return False
        return (not self.strands or transcript1.getDirection() == transcript2.getDirection())

    def _checkPassed(self, transcript1, transcript2):
        # the second transcript starts before the first one
        return (transcript2.getStart() < transcript1.getStart())

    def collapseChromosome(self, chromosome):
        # NOTE(review): this method uses the names "table" and "self.writeTranscript", which are
        # not defined in this file, and is not called by collapse() -- confirm whether it can be removed
        progress            = Progress(table.getNbElements(), "Analysing chromosome %s" % (chromosome), self.verbosity)
        command             = "SELECT * FROM %s ORDER BY start ASC, end DESC" % (table.name)
        transcriptStart     = None
        transcriptEnd       = None
        transcriptDirection = None
        currentTranscript   = None
        if self.strands:
            command += ", direction"
        for index, transcript in table.selectTranscripts(command, True):
            self.nbRead += 1
            if not self.strands:
                transcript.setDirection("+")
            if transcriptStart != transcript.getStart() or transcriptEnd != transcript.getEnd() or transcriptDirection != transcript.getDirection():
                self.writeTranscript(currentTranscript)
                transcriptStart     = transcript.getStart()
                transcriptEnd       = transcript.getEnd()
                transcriptDirection = transcript.getDirection()
                currentTranscript   = transcript
            else:
                currentTranscript.setTagValue("nbElements", (currentTranscript.getTagValue("nbElements") + 1) if "nbElements" in currentTranscript.getTagNames() else 1)
            progress.inc()
        self.writeTranscript(currentTranscript)
        progress.done()

    def collapse(self):
        """
        Sort the input, collapse the reads of each chromosome, and close the output
        """
        self._sortFile()
        for chromosome in sorted(self.nbElementsPerChromosome.keys()):
            self._iterate(chromosome)
        self.close()
        if self.verbosity > 1:
            # no percentage can be computed for an empty input
            ratio = (float(self.nbWritten) / self.nbRead * 100) if self.nbRead else 0.0
            print("# reads read: %d" % (self.nbRead))
            print("# reads written: %d (%.2f%%)" % (self.nbWritten, ratio))
            print("# reads merges: %d" % (self.nbMerges))

if __name__ == "__main__":

    # parse command line
    description = "Collapse Reads v1.0.3: Merge two reads if they have exactly the same genomic coordinates. [Category: Merge]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in mapping format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the file [compulsory] [format: mapping file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-s", "--strands", dest="strands", action="store_true", default=False, help="merge elements on 2 different strands [format: bool] [default: false]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]")
    (options, args) = parser.parse_args()

    collapser = CollapseReads(options.verbosity)
    collapser.setInputFile(options.inputFileName, options.format)
    collapser.setOutputFile(options.outputFileName)
    collapser.strands = not options.strands
    collapser.collapse()
    collapser.close()
00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/CombineTags.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,115 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2011 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
# Names of the arithmetic operations accepted by CombineTags.setOperation.
OPERATIONS = ("plus", "minus", "times", "div")


class CombineTags(object):
    """Combine two tags of every transcript into a new tag.

    For each transcript read from the input parser, the values of two tags
    are converted to float, combined with one of OPERATIONS, stored under an
    output tag, and the transcript is written with a Gff3Writer.
    """

    def __init__(self, verbosity = 0):
        """Store the trace level passed on to the parser, writer and progress bar."""
        self.verbosity = verbosity

    def setInputFile(self, fileName, format):
        """Select a parser for `fileName`, whose transcript format is `format`."""
        self.inputFileName = fileName
        parserChooser = ParserChooser(self.verbosity)
        parserChooser.findFormat(format, "transcript")
        self.parser = parserChooser.getParser(fileName)

    def setOutputFile(self, fileName):
        """Create the GFF3 writer that receives the updated transcripts."""
        self.outputWriter = Gff3Writer(fileName, self.verbosity)

    def setTags(self, tag1, tag2, outputTag, defaultValue = None):
        """Set the two input tag names, the output tag name, and the value
        stored (unchanged) in the output tag when an input tag is missing."""
        self.tag1 = tag1
        self.tag2 = tag2
        self.outputTag = outputTag
        self.defaultValue = defaultValue

    def setOperation(self, operation):
        """Select the operation; raise Exception if it is not in OPERATIONS.

        The operation is validated before it is stored, so a rejected value
        never replaces a previously accepted one.
        """
        if operation not in OPERATIONS:
            raise Exception("Do not handle operation %s, only: %s" % (operation, ", ".join(OPERATIONS)))
        self.operation = operation

    def run(self):
        """Combine the tags of every transcript and write the result.

        Raises Exception when a transcript misses one of the two tags and no
        default value was given.  The parser and the writer are closed even
        when an error interrupts the loop, so no file handle is leaked.
        """
        progress = Progress(self.parser.getNbTranscripts(), "Printing transcripts %s" % (self.inputFileName), self.verbosity)
        try:
            for transcript in self.parser.getIterator():
                tag1 = transcript.getTagValue(self.tag1)
                tag2 = transcript.getTagValue(self.tag2)
                if tag1 is None or tag2 is None:
                    if self.defaultValue is None:
                        raise Exception("Transcript %s misses one of the tags %s and %s, and has no default value !" % (transcript, self.tag1, self.tag2))
                    newTag = self.defaultValue
                else:
                    tag1, tag2 = float(tag1), float(tag2)
                    if self.operation == "plus":
                        newTag = tag1 + tag2
                    elif self.operation == "minus":
                        newTag = tag1 - tag2
                    elif self.operation == "times":
                        newTag = tag1 * tag2
                    elif self.operation == "div":
                        # A zero second tag raises ZeroDivisionError, as before.
                        newTag = tag1 / tag2
                transcript.setTagValue(self.outputTag, newTag)
                self.outputWriter.addTranscript(transcript)
                progress.inc()
            progress.done()
        finally:
            self.parser.close()
            self.outputWriter.close()


if __name__ == "__main__":

    # parse command line
    # (the description used to be a copy of the "Change Tag Name" tool's)
    description = "Combine Tags v1.0.1: Combine the values of two tags of a list of transcripts with an arithmetic operation. [Category: Data Modification]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--inputFormat", dest="inputFormat", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-t", "--tag1", dest="tag1", action="store", type="string", help="name of the first tag [compulsory] [format: string]")
    parser.add_option("-T", "--tag2", dest="tag2", action="store", type="string", help="name of the second tag [compulsory] [format: string]")
    parser.add_option("-d", "--default", dest="defaultValue", action="store", default=None, type="string", help="default value when one of the tag is absent [compulsory] [format: float]")
    parser.add_option("-n", "--new", dest="newTag", action="store", type="string", help="name of the new tag [compulsory] [format: string]")
    parser.add_option("-p", "--operation", dest="operation", action="store", type="string", help="operation combining the tags [compulsory] [format: choice (plus, minus, times, div)]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")
    (options, args) = parser.parse_args()

    combiner = CombineTags(options.verbosity)
    combiner.setInputFile(options.inputFileName, options.inputFormat)
    combiner.setOutputFile("%s.gff3" % (options.outputFileName))
    combiner.setTags(options.tag1, options.tag2, options.newTag, options.defaultValue)
    combiner.setOperation(options.operation)
    combiner.run()
# Keys identifying the two input sets in all per-type dictionaries.
REFERENCE = 0
QUERY = 1
TYPES = (REFERENCE, QUERY)
TYPETOSTRING = {0: "reference", 1: "query"}


class CompareOverlapping(object):
    """Write the query transcripts that overlap (or not) a reference set.

    Both inputs are converted to NC-list files; cursors then walk the query
    and reference lists chromosome by chromosome, and every selected query is
    written to a GFF3 file with "overlapWith" and "nbOverlaps" tags.
    """

    def __init__(self, verbosity = 1):
        self._outputFileName = "outputOverlaps.gff3"
        self._iWriter = None
        self._nbOverlappingQueries = 0
        self._nbOverlaps = 0
        self._nbLines = {REFERENCE: 0, QUERY: 0}
        self._verbosity = verbosity
        self._ncLists = {}
        self._cursors = {}
        self._splittedFileNames = {}
        self._nbElements = {}
        self._nbElementsPerChromosome = {}
        self._inputFileNames = {REFERENCE: None, QUERY: None}
        self._inputFileFormats = {REFERENCE: None, QUERY: None}
        self._starts = {REFERENCE: None, QUERY: None}
        self._ends = {REFERENCE: None, QUERY: None}
        self._fivePrimes = {REFERENCE: None, QUERY: None}
        self._threePrimes = {REFERENCE: None, QUERY: None}
        self._ncListHandlers = {REFERENCE: None, QUERY: None}
        # None (not False) until createNCLists() stores real file names, so
        # that __del__ never hands a boolean to os.path.
        self._convertedFileNames = {REFERENCE: None, QUERY: None}
        self._sorted = False
        self._index = False
        self._introns = False
        self._antisense = False
        self._colinear = False
        self._invert = False
        self._distance = 0
        self._minOverlap = 1
        self._pcOverlap = None
        self._included = False
        self._including = False
        self._outputNotOverlapping = False
        self._tmpRefFileName = None
        self._currentQueryTranscript = None
        self._currentOrQueryTranscript = None
        self._currentExQueryTranscript = None
        # Suffix used to build the temporary file names.
        self._randInt = random.randint(0, 100000)

    def __del__(self):
        """Remove the temporary reference file and the converted NC-list files."""
        for fileName in [self._tmpRefFileName] + list(self._convertedFileNames.values()):
            # Skip placeholders (None, or False from older state).
            if fileName and os.path.exists(fileName):
                os.remove(fileName)

    def close(self):
        """Close the output writer, if setOutput() created one."""
        if self._iWriter is not None:
            self._iWriter.close()

    def setInput(self, fileName, format, type):
        """Register an input file and its format for `type` (REFERENCE or QUERY)."""
        chooser = ParserChooser(self._verbosity)
        chooser.findFormat(format)
        self._inputFileNames[type] = fileName
        self._inputFileFormats[type] = format

    def setOutput(self, outputFileName):
        """Create the GFF3 writer.

        An empty or missing name (the command line passes None when -o is
        not given) keeps the default "outputOverlaps.gff3".
        """
        if outputFileName:
            self._outputFileName = outputFileName
        self._iWriter = Gff3Writer(self._outputFileName)

    def setSorted(self, sorted):
        self._sorted = sorted

    def setIndex(self, index):
        self._index = index

    def restrictToStart(self, distance, type):
        self._starts[type] = distance

    def restrictToEnd(self, distance, type):
        self._ends[type] = distance

    def extendFivePrime(self, distance, type):
        self._fivePrimes[type] = distance

    def extendThreePrime(self, distance, type):
        self._threePrimes[type] = distance

    def acceptIntrons(self, boolean):
        self._introns = boolean

    def getAntisenseOnly(self, boolean):
        self._antisense = boolean

    def getColinearOnly(self, boolean):
        self._colinear = boolean

    def getInvert(self, boolean):
        self._invert = boolean

    def setMaxDistance(self, distance):
        self._distance = distance

    def setMinOverlap(self, overlap):
        self._minOverlap = overlap

    def setPcOverlap(self, overlap):
        self._pcOverlap = overlap

    def setIncludedOnly(self, boolean):
        self._included = boolean

    def setIncludingOnly(self, boolean):
        self._including = boolean

    def includeNotOverlapping(self, boolean):
        self._outputNotOverlapping = boolean

    def transformTranscript(self, transcript, type):
        """Apply the restrictions/extensions configured for `type` and return the transcript."""
        if self._starts[type] is not None:
            transcript.restrictStart(self._starts[type])
        if self._ends[type] is not None:
            transcript.restrictEnd(self._ends[type])
        if self._fivePrimes[type] is not None:
            transcript.extendStart(self._fivePrimes[type])
        if self._threePrimes[type] is not None:
            transcript.extendEnd(self._threePrimes[type])
        if self._introns:
            transcript.exons = []
        if type == REFERENCE and self._distance > 0:
            transcript.extendExons(self._distance)
        return transcript

    def extendQueryTranscript(self, transcript):
        """Store in _currentExQueryTranscript a copy of `transcript` extended
        by the query 5'/3' extensions, then empty the exon list of `transcript`."""
        self._currentExQueryTranscript = Transcript()
        self._currentExQueryTranscript.copy(transcript)
        if self._fivePrimes[QUERY] is not None:
            self._currentExQueryTranscript.extendStart(self._fivePrimes[QUERY])
        if self._threePrimes[QUERY] is not None:
            self._currentExQueryTranscript.extendEnd(self._threePrimes[QUERY])
        transcript.exons = []

    def createTmpRefFile(self):
        """Write the transformed reference transcripts to a temporary pickle
        file (under $SMARTTMPPATH when set) and use it as the reference input."""
        self._tmpRefFileName = "tmp_ref_%d.pkl" % (self._randInt)
        if "SMARTTMPPATH" in os.environ:
            self._tmpRefFileName = os.path.join(os.environ["SMARTTMPPATH"], self._tmpRefFileName)
        chooser = ParserChooser(self._verbosity)
        chooser.findFormat(self._inputFileFormats[REFERENCE])
        parser = chooser.getParser(self._inputFileNames[REFERENCE])
        writer = NCListFilePickle(self._tmpRefFileName, self._verbosity)
        for transcript in parser.getIterator():
            transcript = self.transformTranscript(transcript, REFERENCE)
            writer.addTranscript(transcript)
        writer.close()
        self._inputFileNames[REFERENCE] = self._tmpRefFileName
        self._inputFileFormats[REFERENCE] = "pkl"

    def createNCLists(self):
        """Convert both inputs to NC-list files, load them, and create one
        cursor per chromosome (plus the reference index when requested)."""
        self._ncLists = dict([type, {}] for type in TYPES)
        self._indices = dict([type, {}] for type in TYPES)
        self._cursors = dict([type, {}] for type in TYPES)
        for type in TYPES:
            if self._verbosity > 2:
                print("Creating %s NC-list..." % (TYPETOSTRING[type]))
            self._convertedFileNames[type] = "%s_%d_%d.ncl" % (self._inputFileNames[type], self._randInt, type)
            ncLists = ConvertToNCList(self._verbosity)
            ncLists.setInputFileName(self._inputFileNames[type], self._inputFileFormats[type])
            ncLists.setOutputFileName(self._convertedFileNames[type])
            ncLists.setSorted(self._sorted)
            if type == REFERENCE and self._index:
                ncLists.setIndex(True)
            ncLists.run()
            self._ncListHandlers[type] = NCListHandler(self._verbosity)
            self._ncListHandlers[type].setFileName(self._convertedFileNames[type])
            self._ncListHandlers[type].loadData()
            self._nbLines[type] = self._ncListHandlers[type].getNbElements()
            self._nbElementsPerChromosome[type] = self._ncListHandlers[type].getNbElementsPerChromosome()
            self._ncLists[type] = self._ncListHandlers[type].getNCLists()
            for chromosome, ncList in self._ncLists[type].items():
                self._cursors[type][chromosome] = NCListCursor(None, ncList, 0, self._verbosity)
                if type == REFERENCE and self._index:
                    self._indices[REFERENCE][chromosome] = ncList.getIndex()
            if self._verbosity > 2:
                print(" ...done")

    def compare(self):
        """Walk the query cursor of every chromosome against the reference
        cursor and write the selected queries; records the elapsed time."""
        nbSkips, nbMoves = 0, 0
        previousChromosome = None
        done = False
        startTime = time.time()
        progress = Progress(len(self._ncLists[QUERY]), "Checking overlap", self._verbosity)
        for chromosome in self._ncLists[QUERY]:
            # Result unused here; call kept in case it has side effects -- TODO confirm.
            self._ncListHandlers[QUERY].getParser(chromosome)
            queryCursor = self._cursors[QUERY][chromosome]
            if chromosome != previousChromosome:
                previousChromosome = chromosome
                if chromosome not in self._ncLists[REFERENCE]:
                    # No reference here: every query is "not overlapping".
                    if self._outputNotOverlapping:
                        while not queryCursor.isOut():
                            self._currentQueryTranscript = queryCursor.getTranscript()
                            self._writeIntervalInNewGFF3({})
                            if queryCursor.hasChildren():
                                queryCursor.moveDown()
                            else:
                                queryCursor.moveNext()
                    progress.inc()
                    continue
                refCursor = self._cursors[REFERENCE][chromosome]
            while True:
                self._currentOrQueryTranscript = queryCursor.getTranscript()
                self._currentQueryTranscript = Transcript()
                self._currentQueryTranscript.copy(self._currentOrQueryTranscript)
                self._currentQueryTranscript = self.transformTranscript(self._currentQueryTranscript, QUERY)
                self.extendQueryTranscript(self._currentOrQueryTranscript)
                newRefLaddr = self.checkIndex(refCursor)
                if newRefLaddr is not None:
                    nbMoves += 1
                    refCursor.setLIndex(newRefLaddr)
                    done = False
                refCursor, done, unmatched = self.findOverlapIter(refCursor, done)
                if refCursor.isOut():
                    if not self._invert and not self._outputNotOverlapping:
                        break
                if (unmatched and not self._invert and not self._outputNotOverlapping) or not queryCursor.hasChildren():
                    queryCursor.moveNext()
                    nbSkips += 1
                else:
                    queryCursor.moveDown()
                if queryCursor.isOut():
                    break
            progress.inc()
        progress.done()
        endTime = time.time()
        self._timeSpent = endTime - startTime
        if self._verbosity >= 10:
            print("# skips: %d" % (nbSkips))
            print("# moves: %d" % (nbMoves))

    def findOverlapIter(self, cursor, done):
        """Collect the references overlapping the current query, starting at `cursor`.

        Writes the query through _writeIntervalInNewGFF3 and returns a tuple
        (cursor to restart from for the next query, next `done` flag,
        unmatched flag).  Statement order is significant; it is kept exactly.
        """
        chromosome = self._currentQueryTranscript.getChromosome()
        matched = False
        if chromosome not in self._ncLists[REFERENCE]:
            return None, False, True
        overlappingNames = {}
        nextDone = False
        firstOverlapLAddr = NCListCursor(cursor)
        firstOverlapLAddr.setLIndex(-1)
        if cursor.isOut():
            self._writeIntervalInNewGFF3(overlappingNames)
            return firstOverlapLAddr, False, True
        # First look at the ancestors of the current reference node.
        parentCursor = NCListCursor(cursor)
        parentCursor.moveUp()
        firstParentAfter = False
        while not parentCursor.isOut():
            if self.isOverlapping(parentCursor) == 0:
                matched = True
                if self._checkOverlap(parentCursor.getTranscript()):
                    overlappingNames.update(self._extractID(parentCursor.getTranscript()))
                if firstOverlapLAddr.isOut():
                    firstOverlapLAddr.copy(parentCursor)
                    nextDone = True
            elif self.isOverlapping(parentCursor) == 1:
                firstParentAfter = NCListCursor(parentCursor)
            parentCursor.moveUp()
        if firstParentAfter:
            written = self._writeIntervalInNewGFF3(overlappingNames)
            return firstParentAfter, False, not written if self._invert else not matched
        #This loop finds the overlaps with currentRefLAddr.#
        while True:
            parentCursor = NCListCursor(cursor)
            parentCursor.moveUp()
            #In case: Query is on the right of the RefInterval and does not overlap.
            overlap = self.isOverlapping(cursor)
            if overlap == -1:
                cursor.moveNext()
            #In case: Query overlaps with RefInterval.
            elif overlap == 0:
                matched = True
                if self._checkOverlap(cursor.getTranscript()):
                    overlappingNames.update(self._extractID(cursor.getTranscript()))
                if firstOverlapLAddr.compare(parentCursor):
                    firstOverlapLAddr.copy(cursor)
                    nextDone = True
                if done:
                    cursor.moveNext()
                else:
                    if not cursor.hasChildren():
                        cursor.moveNext()
                        if cursor.isOut():
                            break
                    else:
                        cursor.moveDown()
            #In case: Query is on the left of the RefInterval and does not overlap.
            else:
                if firstOverlapLAddr.isOut() or firstOverlapLAddr.compare(parentCursor):
                    firstOverlapLAddr.copy(cursor)
                    nextDone = False # new
                break

            done = False
            if cursor.isOut():
                break
        written = self._writeIntervalInNewGFF3(overlappingNames)
        return firstOverlapLAddr, nextDone, not written if self._invert else not matched

    def isOverlapping(self, refTranscript):
        """Compare the extended query with `refTranscript` by start/end only.

        Returns 0 when the two ranges intersect, 1 when the query ends before
        the reference starts, and -1 otherwise.
        """
        if (self._currentExQueryTranscript.getStart() <= refTranscript.getEnd() and self._currentExQueryTranscript.getEnd() >= refTranscript.getStart()):
            return 0
        if self._currentExQueryTranscript.getEnd() < refTranscript.getStart():
            return 1
        return -1

    def checkIndex(self, cursor):
        """Return the L-index proposed by the reference index for the current
        query when its GFF address is past the cursor's, else None."""
        if not self._index:
            return None
        if cursor.isOut():
            return None
        chromosome = self._currentExQueryTranscript.getChromosome()
        nextLIndex = self._indices[REFERENCE][chromosome].getIndex(self._currentExQueryTranscript)
        if nextLIndex is None:
            return None
        ncList = self._ncLists[REFERENCE][chromosome]
        nextGffAddress = ncList.getRefGffAddr(nextLIndex)
        thisGffAddress = cursor.getGffAddress()
        if nextGffAddress > thisGffAddress:
            return nextLIndex
        return None

    def _writeIntervalInNewGFF3(self, names):
        """Update the counters and write the current query if it is selected.

        `names` maps overlapping reference ids to their element counts.
        Returns True when the query was written, False otherwise.
        """
        nbOverlaps = 0
        for cpt in names.values():
            nbOverlaps += cpt
        self._nbOverlappingQueries += 1 if Utils.xor(names, self._invert) else 0
        self._nbOverlaps += nbOverlaps if Utils.xor(names, self._invert) else 0
        if names:
            self._currentQueryTranscript.setTagValue("overlapWith", ",".join(names))
            self._currentQueryTranscript.setTagValue("nbOverlaps", nbOverlaps)
            if self._invert:
                return False
        else:
            if self._outputNotOverlapping:
                self._currentQueryTranscript.setTagValue("nbOverlaps", 0)
            elif not self._invert:
                return False
        self._iWriter.addTranscript(self._currentQueryTranscript)
        self._iWriter.write()
        return True

    def _extractID(self, transcript):
        """Return {id: nbElements} for `transcript`: the "ID" tag (or the
        unique name) mapped to the "nbElements" tag as a float (default 1)."""
        id = transcript.getTagValue("ID") if "ID" in transcript.getTagNames() else transcript.getUniqueName()
        nbElements = transcript.getTagValue("nbElements") if "nbElements" in transcript.getTagNames() else 1
        return {id: float(nbElements)}

    def _checkOverlap(self, refTranscript):
        """Tell whether `refTranscript` passes every configured overlap filter
        (distance, minimum overlap, strand, inclusion, exon overlap)."""
        if self._currentQueryTranscript.getDistance(refTranscript) > self._distance:
            return False
        minOverlap = self._minOverlap
        if self._pcOverlap is not None:
            minOverlap = max(self._minOverlap, self._currentQueryTranscript.getSize() / 100.0 * self._pcOverlap)
        if not self._currentQueryTranscript.overlapWith(refTranscript, minOverlap):
            return False
        if self._antisense and self._currentQueryTranscript.getDirection() == refTranscript.getDirection():
            return False
        if self._colinear and self._currentQueryTranscript.getDirection() != refTranscript.getDirection():
            return False
        if self._included and not refTranscript.include(self._currentQueryTranscript):
            return False
        if self._including and not self._currentQueryTranscript.include(refTranscript):
            return False
        if self._introns:
            return True
        return self._currentQueryTranscript.overlapWithExon(refTranscript, minOverlap)

    def run(self):
        """Run the whole comparison and print a summary when verbosity > 0."""
        self.createTmpRefFile()
        self.createNCLists()
        self.compare()
        self.close()
        if self._verbosity > 0:
            print("# queries: %d" % (self._nbLines[QUERY]))
            print("# refs: %d" % (self._nbLines[REFERENCE]))
            print("# written: %d (%d overlaps)" % (self._nbOverlappingQueries, self._nbOverlaps))
            print("time: %ds" % (self._timeSpent))


if __name__ == "__main__":
    description = "Compare Overlapping v1.0.4: Get the data which overlap with a reference set. [Category: Data Comparison]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of file 2 [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="output", action="store", default=None, type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-D", "--index", dest="index", action="store_true", default=False, help="add an index to the reference file (faster but more memory) [format: boolean] [default: False]")
    parser.add_option("-r", "--sorted", dest="sorted", action="store_true", default=False, help="input files are already sorted [format: boolean] [default: False]")
    parser.add_option("-S", "--start1", dest="start1", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 1 (do not use it with -U) [format: int]")
    parser.add_option("-s", "--start2", dest="start2", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 2 (do not use it with -u) [format: int]")
    parser.add_option("-U", "--end1", dest="end1", action="store", default=None, type="int", help="only consider the n last nucleotides of the transcripts in file 1 (do not use it with -S) [format: int]")
    parser.add_option("-u", "--end2", dest="end2", action="store", default=None, type="int", help="only consider the n last nucleotides of the transcripts in file 2 (do not use it with -s) [format: int]")
    parser.add_option("-t", "--intron", dest="introns", action="store_true", default=False, help="also report introns [format: bool] [default: false]")
    parser.add_option("-E", "--5primeExtension1", dest="fivePrime1", action="store", default=None, type="int", help="extension towards 5' in file 1 [format: int]")
    parser.add_option("-e", "--5primeExtension2", dest="fivePrime2", action="store", default=None, type="int", help="extension towards 5' in file 2 [format: int]")
    parser.add_option("-N", "--3primeExtension1", dest="threePrime1", action="store", default=None, type="int", help="extension towards 3' in file 1 [format: int]")
    parser.add_option("-n", "--3primeExtension2", dest="threePrime2", action="store", default=None, type="int", help="extension towards 3' in file 2 [format: int]")
    parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="colinear only [format: bool] [default: false]")
    parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="antisense only [format: bool] [default: false]")
    parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="accept some distance between query and reference [format: int]")
    parser.add_option("-k", "--included", dest="included", action="store_true", default=False, help="keep only elements from file 1 which are included in an element of file 2 [format: bool] [default: false]")
    parser.add_option("-K", "--including", dest="including", action="store_true", default=False, help="keep only elements from file 2 which are included in an element of file 1 [format: bool] [default: false]")
    parser.add_option("-m", "--minOverlap", dest="minOverlap", action="store", default=1, type="int", help="minimum number of nucleotides overlapping to declare an overlap [format: int] [default: 1]")
    parser.add_option("-p", "--pcOverlap", dest="pcOverlap", action="store", default=None, type="int", help="minimum percentage of nucleotides to overlap to declare an overlap [format: int]")
    parser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]")
    parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    co = CompareOverlapping(options.verbosity)
    co.setInput(options.inputFileName1, options.format1, QUERY)
    co.setInput(options.inputFileName2, options.format2, REFERENCE)
    co.setOutput(options.output)
    co.setSorted(options.sorted)
    co.setIndex(options.index)
    co.restrictToStart(options.start1, QUERY)
    co.restrictToStart(options.start2, REFERENCE)
    co.restrictToEnd(options.end1, QUERY)
    co.restrictToEnd(options.end2, REFERENCE)
    co.extendFivePrime(options.fivePrime1, QUERY)
    co.extendFivePrime(options.fivePrime2, REFERENCE)
    co.extendThreePrime(options.threePrime1, QUERY)
    co.extendThreePrime(options.threePrime2, REFERENCE)
    co.acceptIntrons(options.introns)
    co.getAntisenseOnly(options.antisense)
    co.getColinearOnly(options.colinear)
    co.getInvert(options.exclude)
    co.setMaxDistance(options.distance)
    co.setMinOverlap(options.minOverlap)
    co.setPcOverlap(options.pcOverlap)
    co.setIncludedOnly(options.included)
    co.setIncludingOnly(options.including)
    co.includeNotOverlapping(options.notOverlapping)
    co.run()
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2011 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
# Smallest and largest bin levels: a bin of level i is 10**i nucleotides wide.
MINBIN = 3
MAXBIN = 7
REFERENCE = 0
QUERY = 1

def getBin(start, end):
    """Return the id of the smallest bin that contains [start, end].

    The id of bin number k at level i is i * 10**(MAXBIN + 1) + k.  When no
    level from MINBIN to MAXBIN holds both ends in one bin, the single
    catch-all id (MAXBIN + 1) * 10**(MAXBIN + 1) is returned.
    """
    for i in range(MINBIN, MAXBIN + 1):
        binLevel = 10 ** i
        # Floor division keeps the Python 2 integer semantics on any version.
        if int(start // binLevel) == int(end // binLevel):
            return int(i * 10 ** (MAXBIN + 1) + int(start // binLevel))
    return int((MAXBIN + 1) * 10 ** (MAXBIN + 1))

def getOverlappingBins(start, end):
    """Return, per level, the (first, last) bin ids touched by [start, end],
    followed by the catch-all bin as a one-element range."""
    array = []
    bigBin = int((MAXBIN + 1) * 10 ** (MAXBIN + 1))
    for i in range(MINBIN, MAXBIN + 1):
        binLevel = 10 ** i
        offset = i * 10 ** (MAXBIN + 1)
        array.append((int(offset + int(start // binLevel)), int(offset + int(end // binLevel))))
    array.append((bigBin, bigBin))
    return array


class CompareOverlappingSmallQuery(object):
    """Report the queries overlapping a reference set, keeping the queries in memory.

    Queries are stored in bins per chromosome; each reference read afterwards
    is compared only with the queries of the bins it touches.
    """

    def __init__(self, verbosity):
        self.verbosity = verbosity
        self.tableNames = {}
        self.nbQueries = 0
        self.nbRefs = 0
        self.nbWritten = 0
        self.nbOverlaps = 0
        self.distance = None
        self.invert = False
        self.antisense = False
        self.collinear = False
        # chromosome -> bin id -> list of query transcripts
        self.bins = {}
        # query transcript -> {reference name: number of elements}
        self.overlaps = {}
        self.notOverlapping = False

    def setReferenceFile(self, fileName, format):
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        self.refParser = chooser.getParser(fileName)

    def setQueryFile(self, fileName, format):
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        self.queryParser = chooser.getParser(fileName)

    def setOutputFile(self, fileName):
        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)

    def setDistance(self, distance):
        self.distance = distance

    def setInvert(self, boolean):
        self.invert = boolean

    def setCollinear(self, boolean):
        self.collinear = boolean

    def setAntisense(self, boolean):
        self.antisense = boolean

    def includeNotOverlapping(self, boolean):
        self.notOverlapping = boolean

    def loadQuery(self):
        """Read every query and file it under its chromosome and bin."""
        progress = UnlimitedProgress(10000, "Reading queries", self.verbosity)
        for transcript in self.queryParser.getIterator():
            if transcript.__class__.__name__ == "Mapping":
                transcript = transcript.getTranscript()
            chromosome = transcript.getChromosome()
            binId = getBin(transcript.getStart(), transcript.getEnd())
            if chromosome not in self.bins:
                self.bins[chromosome] = {}
            if binId not in self.bins[chromosome]:
                self.bins[chromosome][binId] = []
            self.bins[chromosome][binId].append(transcript)
            # Pre-register every query when the non-overlapping ones may be output.
            if self.notOverlapping or self.invert:
                self.overlaps[transcript] = {}
            self.nbQueries += 1
            progress.inc()
        progress.done()

    def _compareTwoTranscripts(self, queryTranscript, refTranscript):
        """Tell whether the query overlaps an exon of the reference and
        satisfies the collinear/antisense restriction."""
        if not queryTranscript.overlapWithExon(refTranscript):
            return False
        if self.collinear and queryTranscript.getDirection() != refTranscript.getDirection():
            return False
        if self.antisense and queryTranscript.getDirection() == refTranscript.getDirection():
            return False
        return True

    def _alterTranscript(self, transcript, type):
        """Extend the exons of a reference by the configured distance."""
        if type == REFERENCE:
            if self.distance is not None:
                transcript.extendExons(self.distance)
        return transcript

    def _compareTranscript(self, refTranscript):
        """Record `refTranscript` in the overlaps of every query it matches."""
        refChromosome = refTranscript.getChromosome()
        if refChromosome not in self.bins:
            return []
        refStart = refTranscript.getStart()
        refEnd = refTranscript.getEnd()
        chromosomeBins = self.bins[refChromosome]
        for firstBin, lastBin in getOverlappingBins(refStart, refEnd):
            for binId in range(firstBin, lastBin + 1):
                if binId not in chromosomeBins:
                    continue
                for queryTranscript in chromosomeBins[binId]:
                    if self._compareTwoTranscripts(queryTranscript, refTranscript):
                        if queryTranscript not in self.overlaps:
                            self.overlaps[queryTranscript] = {}
                        nbElements = int(float(refTranscript.getTagValue("nbElements"))) if "nbElements" in refTranscript.getTagNames() else 1
                        self.overlaps[queryTranscript][refTranscript.getName()] = nbElements
                        self.nbOverlaps += nbElements

    def _updateTranscript(self, queryTranscript):
        """Set the "nbOverlaps" tag, and "overlapsWith" (reference names joined
        by "--", cut to 100 characters) when there is at least one overlap."""
        overlaps = self.overlaps[queryTranscript]
        queryTranscript.setTagValue("nbOverlaps", sum(overlaps.values()))
        if overlaps:
            queryTranscript.setTagValue("overlapsWith", "--".join(overlaps.keys())[:100])
        return queryTranscript

    def compare(self):
        """Read every reference and compare it with the stored queries."""
        progress = UnlimitedProgress(10000, "Comparing references", self.verbosity)
        for refTranscript in self.refParser.getIterator():
            if refTranscript.__class__.__name__ == "Mapping":
                refTranscript = refTranscript.getTranscript()
            refTranscript = self._alterTranscript(refTranscript, REFERENCE)
            self._compareTranscript(refTranscript)
            self.nbRefs += 1
            progress.inc()
        progress.done()

    def printResults(self):
        """Write the registered queries (only those without overlap when the
        match is inverted) and close the writer."""
        for transcript in self.overlaps:
            if not self.invert or not self.overlaps[transcript]:
                if not self.invert:
                    transcript = self._updateTranscript(transcript)
                self.writer.addTranscript(transcript)
                self.nbWritten += 1
        self.writer.close()

    def displayResults(self):
        """Print the counters of the run."""
        print("# queries: %d" % (self.nbQueries))
        print("# refs: %d" % (self.nbRefs))
        print("# written: %d (%d overlaps)" % (self.nbWritten, self.nbOverlaps))

    def run(self):
        self.loadQuery()
        self.compare()
        self.printResults()
        self.displayResults()

if __name__ == "__main__":

    description = "Compare Overlapping Small Query v1.0.1: Provide the queries that overlap with a reference, when the query is small. [Category: Data Comparison]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]")
    parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="accept some distance between query and reference [format: int]")
    parser.add_option("-c", "--collinear", dest="collinear", action="store_true", default=False, help="provide collinear features [format: bool] [default: false]")
    parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="provide antisense features [format: bool] [default: false]")
    parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    cosq = CompareOverlappingSmallQuery(options.verbosity)
cosq.setQueryFile(options.inputFileName1, options.format1) + cosq.setReferenceFile(options.inputFileName2, options.format2) + cosq.setOutputFile(options.outputFileName) + cosq.includeNotOverlapping(options.notOverlapping) + cosq.setDistance(options.distance) + cosq.setCollinear(options.collinear) + cosq.setAntisense(options.antisense) + cosq.setInvert(options.exclude) + cosq.run() + + diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/CompareOverlappingSmallRef.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/CompareOverlappingSmallRef.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,217 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2011 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. 
# Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
from optparse import OptionParser
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.writer.TranscriptWriter import TranscriptWriter
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.structure.Mapping import Mapping
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress

# Bin levels: level i uses bins of width 10**i, for i in MINBIN..MAXBIN.
MINBIN = 3
MAXBIN = 7
# Roles that can be passed to _alterTranscript().
REFERENCE = 0
QUERY = 1


def getBin(start, end):
    """Return the id of the smallest bin that contains both start and end.

    Levels MINBIN..MAXBIN are tried in increasing order; at level i the bin
    width is 10**i.  The id is i * 10**(MAXBIN + 1) plus the index of the bin
    at that level.  When start and end share no bin at any level, the
    catch-all id (MAXBIN + 1) * 10**(MAXBIN + 1) is returned.
    """
    levelOffset = 10 ** (MAXBIN + 1)
    for i in range(MINBIN, MAXBIN + 1):
        binLevel = 10 ** i
        # Floor division gives the same result under Python 2 and Python 3.
        startIndex = int(start // binLevel)
        if startIndex == int(end // binLevel):
            return int(i * levelOffset + startIndex)
    return int((MAXBIN + 1) * levelOffset)


def getOverlappingBins(start, end):
    """Return the bin id ranges that may hold elements overlapping [start, end].

    The result is a list of (firstBin, lastBin) pairs, one per level from
    MINBIN to MAXBIN, followed by a pair made of the catch-all bin id.
    """
    levelOffset = 10 ** (MAXBIN + 1)
    bigBin = int((MAXBIN + 1) * levelOffset)
    array = []
    for i in range(MINBIN, MAXBIN + 1):
        binLevel = 10 ** i
        firstBin = int(i * levelOffset + int(start // binLevel))
        lastBin = int(i * levelOffset + int(end // binLevel))
        array.append((firstBin, lastBin))
    array.append((bigBin, bigBin))
    return array


class CompareOverlappingSmallRef(object):
    """Report the queries that overlap a reference, keeping the references in memory.

    All reference transcripts are loaded into bins (see getBin), then the
    query file is streamed; each query is compared with the references found
    in the candidate bins and written out immediately when selected.
    """

    def __init__(self, verbosity):
        self.verbosity = verbosity
        self.tableNames = {}
        self.nbQueries = 0
        self.nbRefs = 0
        self.nbWritten = 0
        self.nbOverlaps = 0
        self.invert = False
        self.antisense = False
        self.collinear = False
        self.distance = None
        # chromosome -> bin id -> list of reference transcripts
        self.bins = {}
        self.notOverlapping = False

    def setReferenceFile(self, fileName, format):
        """Create the parser used to read the reference file."""
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        self.refParser = chooser.getParser(fileName)

    def setQueryFile(self, fileName, format):
        """Create the parser used to read the query file."""
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        self.queryParser = chooser.getParser(fileName)

    def setOutputFile(self, fileName):
        """Create the GFF3 writer that receives the selected queries."""
        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)

    def setDistance(self, distance):
        """Set the value passed to extendExons() on each reference (None: no extension)."""
        self.distance = distance

    def setCollinear(self, boolean):
        """When set, only accept overlaps where both directions are equal."""
        self.collinear = boolean

    def setAntisense(self, boolean):
        """When set, only accept overlaps where the directions differ."""
        self.antisense = boolean

    def setInvert(self, boolean):
        """When set, output the queries that have no overlap instead."""
        self.invert = boolean

    def includeNotOverlapping(self, boolean):
        """When set, queries without any overlap are written too."""
        self.notOverlapping = boolean

    def loadRef(self):
        """Read every reference and store it in the bin matching its coordinates."""
        progress = UnlimitedProgress(10000, "Reading references", self.verbosity)
        for transcript in self.refParser.getIterator():
            if transcript.__class__.__name__ == "Mapping":
                transcript = transcript.getTranscript()
            # The extension is applied before binning so that the bin matches
            # the extended coordinates.
            transcript = self._alterTranscript(transcript, REFERENCE)
            chromosome = transcript.getChromosome()
            binId = getBin(transcript.getStart(), transcript.getEnd())
            if chromosome not in self.bins:
                self.bins[chromosome] = {}
            if binId not in self.bins[chromosome]:
                self.bins[chromosome][binId] = []
            self.bins[chromosome][binId].append(transcript)
            self.nbRefs += 1
            progress.inc()
        progress.done()

    def _alterTranscript(self, transcript, type):
        """Extend the exons of a reference transcript by self.distance, if set."""
        if type == REFERENCE:
            if self.distance is not None:
                transcript.extendExons(self.distance)
        return transcript

    def _compareTwoTranscripts(self, queryTranscript, refTranscript):
        """Return True if the pair overlaps and satisfies the strand filters."""
        if not queryTranscript.overlapWithExon(refTranscript):
            return False
        sameDirection = queryTranscript.getDirection() == refTranscript.getDirection()
        if self.collinear and not sameDirection:
            return False
        if self.antisense and sameDirection:
            return False
        return True

    def _compareTranscript(self, queryTranscript):
        """Return {reference name: number of elements} for the references matching the query.

        The result is always a dict (empty when nothing matches), because
        _updateTranscript() calls .values() and .keys() on it.
        """
        overlaps = {}
        queryChromosome = queryTranscript.getChromosome()
        if queryChromosome not in self.bins:
            return overlaps
        chromosomeBins = self.bins[queryChromosome]
        binRanges = getOverlappingBins(queryTranscript.getStart(), queryTranscript.getEnd())
        for firstBin, lastBin in binRanges:
            for binId in range(firstBin, lastBin + 1):
                if binId not in chromosomeBins:
                    continue
                for refTranscript in chromosomeBins[binId]:
                    if not self._compareTwoTranscripts(queryTranscript, refTranscript):
                        continue
                    # A reference counts for its "nbElements" tag when
                    # present, and for 1 otherwise.
                    if "nbElements" in refTranscript.getTagNames():
                        nbElements = int(float(refTranscript.getTagValue("nbElements")))
                    else:
                        nbElements = 1
                    overlaps[refTranscript.getName()] = nbElements
                    self.nbOverlaps += nbElements
        return overlaps

    def _updateTranscript(self, queryTranscript, overlaps):
        """Add the "nbOverlaps" and "overlapsWith" tags to a query."""
        queryTranscript.setTagValue("nbOverlaps", sum(overlaps.values()))
        if overlaps:
            # The list of reference names is truncated to 100 characters.
            queryTranscript.setTagValue("overlapsWith", "--".join(overlaps.keys())[:100])
        return queryTranscript

    def compare(self):
        """Stream the queries, write the selected ones and close the writer."""
        progress = UnlimitedProgress(10000, "Comparing queries", self.verbosity)
        for queryTranscript in self.queryParser.getIterator():
            if queryTranscript.__class__.__name__ == "Mapping":
                queryTranscript = queryTranscript.getTranscript()
            progress.inc()
            self.nbQueries += 1
            overlaps = self._compareTranscript(queryTranscript)
            if self.notOverlapping or (overlaps and not self.invert) or (not overlaps and self.invert):
                # Inverted output is written untouched: there is nothing to tag.
                if not self.invert:
                    queryTranscript = self._updateTranscript(queryTranscript, overlaps)
                self.writer.addTranscript(queryTranscript)
                self.nbWritten += 1
        progress.done()
        self.writer.close()

    def displayResults(self):
        """Print the counters gathered during the run."""
        # Single-argument print(...) behaves the same under Python 2 and 3.
        print("# queries:  %d" % (self.nbQueries))
        print("# refs:     %d" % (self.nbRefs))
        print("# written:  %d (%d overlaps)" % (self.nbWritten, self.nbOverlaps))

    def run(self):
        """Load the references, compare the queries and report."""
        self.loadRef()
        self.compare()
        self.displayResults()


if __name__ == "__main__":

    description = "Compare Overlapping Small Reference v1.0.1: Provide the queries that overlap with a reference, when the reference is small. [Category: Data Comparison]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]")
    parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="accept some distance between query and reference [format: int]")
    parser.add_option("-c", "--collinear", dest="collinear", action="store_true", default=False, help="provide collinear features [format: bool] [default: false]")
    parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="provide antisense features [format: bool] [default: false]")
    parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # Report a missing compulsory option as a usage error rather than
    # failing later inside the parsers.
    for dest, flag in (("inputFileName1", "-i"), ("format1", "-f"), ("inputFileName2", "-j"), ("format2", "-g")):
        if getattr(options, dest) is None:
            parser.error("option %s is compulsory" % flag)

    cosr = CompareOverlappingSmallRef(options.verbosity)
    cosr.setQueryFile(options.inputFileName1, options.format1)
    cosr.setReferenceFile(options.inputFileName2, options.format2)
    cosr.setOutputFile(options.outputFileName)
    cosr.includeNotOverlapping(options.notOverlapping)
    cosr.setDistance(options.distance)
    # The -c option used to be dropped: setInvert() was called twice and
    # setCollinear() never.
    cosr.setCollinear(options.collinear)
    cosr.setAntisense(options.antisense)
    cosr.setInvert(options.exclude)
    cosr.run()


# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ComputeCoverage.py
# --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/ComputeCoverage.py	Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,142 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2011
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge.
# Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import os, random
from optparse import OptionParser, OptionGroup
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from SMART.Java.Python.misc.Progress import Progress
from commons.core.writer.Gff3Writer import Gff3Writer


class CoverageComputer(object):
    """Compute how many query nucleotides are covered by a reference set.

    The reference positions are stored per chromosome, then the queries are
    scanned to count covered positions; optionally each query is written to a
    GFF3 file with a "coverage" tag (percentage of its size).
    """

    def __init__(self, verbosity = 0):
        self.verbosity = verbosity
        self.queryReader = None
        self.referenceReader = None
        self.outputWriter = None
        self.introns = False
        self.nbNucleotides = 0
        self.nbCovered = 0
        # chromosome -> {position: 1}; filled by readReference()
        self.coveredRegions = {}

    def setInputQueryFile(self, fileName, format):
        """Open the query file."""
        self.queryReader = TranscriptContainer(fileName, format, self.verbosity-1)

    def setInputReferenceFile(self, fileName, format):
        """Open the reference file."""
        self.referenceReader = TranscriptContainer(fileName, format, self.verbosity-1)

    def includeIntrons(self, boolean):
        """When set, removeExons() is called on each transcript before its positions are read."""
        self.introns = boolean

    def setOutputFileName(self, fileName, title="S-MART", feature="transcript", featurePart="exon"):
        """Create the GFF3 writer; a None file name means that no output is requested."""
        if fileName is None:
            # run() skips write() when outputWriter is None.
            self.outputWriter = None
            return
        self.outputWriter = Gff3Writer(fileName, self.verbosity-1)
        self.outputWriter.setTitle(title)
        self.outputWriter.setFeature(feature)
        self.outputWriter.setFeaturePart(featurePart)

    def readReference(self):
        """Record every position covered by an exon of a reference transcript."""
        self.coveredRegions = {}
        progress = Progress(self.referenceReader.getNbTranscripts(), "Reading reference file", self.verbosity-1)
        for transcript in self.referenceReader.getIterator():
            chromosome = transcript.getChromosome()
            if chromosome not in self.coveredRegions:
                self.coveredRegions[chromosome] = {}
            covered = self.coveredRegions[chromosome]
            if self.introns:
                # NOTE(review): presumably collapses the transcript to one
                # span so that introns are counted too -- confirm.
                transcript.removeExons()
            for exon in transcript.getExons():
                for position in range(exon.getStart(), exon.getEnd()+1):
                    covered[position] = 1
            progress.inc()
        progress.done()

    def readQuery(self):
        """Count the query nucleotides and how many of them are covered.

        Queries on a chromosome that is absent from the reference are skipped
        and do not contribute to either counter.
        """
        progress = Progress(self.queryReader.getNbTranscripts(), "Reading query file", self.verbosity-1)
        for transcript in self.queryReader.getIterator():
            progress.inc()
            chromosome = transcript.getChromosome()
            if chromosome not in self.coveredRegions:
                continue
            covered = self.coveredRegions[chromosome]
            if self.introns:
                transcript.removeExons()
            for exon in transcript.getExons():
                for position in range(exon.getStart(), exon.getEnd()+1):
                    self.nbNucleotides += 1
                    self.nbCovered += covered.get(position, 0)
        progress.done()

    def write(self):
        """Write every query with its "coverage" tag, then close the writer."""
        progress = Progress(self.queryReader.getNbTranscripts(), "Writing output file", self.verbosity-1)
        for transcript in self.queryReader.getIterator():
            chromosome = transcript.getChromosome()
            # A chromosome that is absent from the reference has coverage 0
            # (a direct lookup used to raise KeyError here).
            covered = self.coveredRegions.get(chromosome, {})
            if self.introns:
                transcript.removeExons()
            size = transcript.getSize()
            coverage = 0
            for exon in transcript.getExons():
                for position in range(exon.getStart(), exon.getEnd()+1):
                    coverage += covered.get(position, 0)
            transcript.setTagValue("coverage", 0 if size == 0 else float(coverage) / size * 100)
            self.outputWriter.addTranscript(transcript)
            progress.inc()
        progress.done()
        # Other tools in this changeset call close() on their writers explicitly.
        self.outputWriter.close()

    def sumUp(self):
        """Print the global coverage statistics."""
        # Single-argument print(...) behaves the same under Python 2 and 3.
        print("%d nucleotides in query, %d (%.f%%) covered" % (self.nbNucleotides, self.nbCovered, 0 if self.nbNucleotides == 0 else float(self.nbCovered) / self.nbNucleotides * 100))

    def run(self):
        """Read both inputs, write the output if requested and print the summary."""
        self.readReference()
        self.readQuery()
        if self.outputWriter is not None:
            self.write()
        self.sumUp()


if __name__ == "__main__":

    # parse command line
    description = "Compute Coverage v1.0.1: Compute the coverage of a set with respect to another set. [Category: Personal]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input query file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of the first file [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input reference file [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of the second file [compulsory] [format: transcript file format]")
    parser.add_option("-t", "--introns", dest="introns", action="store_true", default=False, help="also include introns [format: boolean] [default: false]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [format: output file in GFF3 format]")
    # The default matches the help text; without it the verbosity was None
    # and "self.verbosity-1" raised TypeError.
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]")
    (options, args) = parser.parse_args()

    # Report a missing compulsory option as a usage error rather than
    # failing later inside the readers.
    for dest, flag in (("inputFileName1", "-i"), ("format1", "-f"), ("inputFileName2", "-j"), ("format2", "-g")):
        if getattr(options, dest) is None:
            parser.error("option %s is compulsory" % flag)

    computer = CoverageComputer(options.verbosity)
    computer.setInputQueryFile(options.inputFileName1, options.format1)
    computer.setInputReferenceFile(options.inputFileName2, options.format2)
    computer.includeIntrons(options.introns)
    computer.setOutputFileName(options.outputFileName)
    computer.run()


# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/CountReadGCPercent.py
# --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/CountReadGCPercent.py	Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,88 @@
#!/usr/bin/env python

from optparse import OptionParser
from commons.core.parsing.FastaParser import FastaParser
from commons.core.writer.Gff3Writer import Gff3Writer
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from SMART.Java.Python.misc.Progress import Progress
from commons.core.utils.RepetOptionParser import RepetOptionParser
from Gnome_tools.CountGCPercentBySlidingWindow import CountGCPercentBySlidingWindow


class CountReadGCPercent(object):
    """Tag each transcript of a GFF3 file with GC and N percentages taken from a genome."""

    def __init__(self, verbosity = 0):
        self.referenceReader = None
        self.gffReader = None
        self.outputWriter = None
        # Must be set before the readers and writers are created: they
        # receive this value in their constructors.
        self.verbose = verbosity

    def setInputReferenceFile(self, fileName):
        """Store the name of the FASTA file; it is opened in write()."""
        self.referenceReader = fileName

    def setInputGffFile(self, fileName):
        """Open the GFF3 annotation file."""
        self.gffReader = TranscriptContainer(fileName, 'gff3', self.verbose)

    def setOutputFileName(self, fileName):
        """Create the GFF3 writer."""
        self.outputWriter = Gff3Writer(fileName, self.verbose)

    def readGffAnnotation(self):
        """Record every position covered by an exon of the annotation."""
        self.coveredRegions = {}
        progress = Progress(self.gffReader.getNbTranscripts(), "Reading gff3 annotation file", self.verbose)
        for transcript in self.gffReader.getIterator():
            chromosome = transcript.getChromosome()
            if chromosome not in self.coveredRegions:
                self.coveredRegions[chromosome] = {}
            covered = self.coveredRegions[chromosome]
            for exon in transcript.getExons():
                for position in range(exon.getStart(), exon.getEnd()+1):
                    covered[position] = 1
            progress.inc()
        progress.done()

    def write(self):
        """Write every transcript with its "GCpercent" and "NPercent" tags, then close the writer.

        The tags hold the values computed for the last exon whose chromosome
        is a sequence of the FASTA file, or 0 when there is none.
        """
        iParser = FastaParser(self.referenceReader)
        iParser.setTags()
        # Collected once: the names were previously rescanned for every exon.
        sequenceNames = set(iParser.getTags().keys())
        iGetGCPercentBySW = CountGCPercentBySlidingWindow()
        progress = Progress(self.gffReader.getNbTranscripts(), "Writing output file", self.verbose)
        for transcript in self.gffReader.getIterator():
            chromosome = transcript.getChromosome()
            GCpercent = 0
            nPercent = 0
            for exon in transcript.getExons():
                if chromosome not in sequenceNames:
                    continue
                subSequence = iParser.getSubSequence(chromosome, exon.getStart() , exon.getEnd(), 1)
                GCpercent, nPercent = iGetGCPercentBySW.getGCPercentAccordingToNAndNPercent(subSequence)
                # Single-argument print(...) behaves the same under Python 2 and 3.
                print("GCpercent = %f, nPercent = %f" % (GCpercent, nPercent))
            transcript.setTagValue("GCpercent", GCpercent)
            transcript.setTagValue("NPercent", nPercent)
            self.outputWriter.addTranscript(transcript)
            progress.inc()
        progress.done()
        # Other tools in this changeset call close() on their writers explicitly.
        self.outputWriter.close()

    def run(self):
        """Read the annotation and write the tagged transcripts if a writer is set."""
        self.readGffAnnotation()
        if self.outputWriter is not None:
            self.write()

if __name__ == "__main__":
    description = "Count GC percent for each read against a genome."
    usage = "CountReadGCPercent.py -i <fasta file> -j <gff3 file> -o <output gff3 file> -v <verbose> -h]"
    examples = "\nExample: \n"
    examples += "\t$ python CountReadGCPercent.py -i file.fasta -j annotation.gff -o output.gff3"
    examples += "\n\n"
    parser = RepetOptionParser(description = description, usage = usage, version = "v1.0", epilog = examples)
    parser.add_option( '-i', '--inputGenome', dest='fastaFile', help='fasta file [compulsory]', default= None )
    parser.add_option( '-j', '--inputAnnotation', dest='gffFile', help='gff3 file [compulsory]', default= None)
    parser.add_option( '-o', '--output', dest='outputFile', help='output gff3 file [compulsory]', default= None )
    parser.add_option( '-v', '--verbose', dest='verbose', help='verbosity level (default=0/1)',type="int", default= 0 )
    (options, args) = parser.parse_args()

    # The -v option used to be parsed and then ignored.
    readGCPercent = CountReadGCPercent(options.verbose)
    readGCPercent.setInputReferenceFile(options.fastaFile)
    readGCPercent.setInputGffFile(options.gffFile)
    readGCPercent.setOutputFileName(options.outputFile)
    readGCPercent.run()

# \ No newline at end of file
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/FindOverlapsOptim.py
# --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/FindOverlapsOptim.py	Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,343 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2012
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# + +import os, struct, time, shutil +from optparse import OptionParser +from commons.core.parsing.ParserChooser import ParserChooser +from commons.core.writer.Gff3Writer import Gff3Writer +from SMART.Java.Python.structure.Transcript import Transcript +from SMART.Java.Python.structure.Interval import Interval +from SMART.Java.Python.ncList.NCList import NCList +from SMART.Java.Python.ncList.ConvertToNCList import ConvertToNCList +from SMART.Java.Python.ncList.NCListParser import NCListParser +from SMART.Java.Python.ncList.NCListCursor import NCListCursor +from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle +from SMART.Java.Python.ncList.NCListHandler import NCListHandler +from SMART.Java.Python.misc.Progress import Progress +from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress +try: + import cPickle as pickle +except: + import pickle + +REFERENCE = 0 +QUERY = 1 +TYPES = (REFERENCE, QUERY) +TYPETOSTRING = {0: "reference", 1: "query"} + +class FindOverlapsOptim(object): + + def __init__(self, verbosity = 1): + self._parsers = {} + self._sortedFileNames = {} + self._outputFileName = "outputOverlaps.gff3" + self._iWriter = None + self._inputFileNames = {REFERENCE: None, QUERY: None} + self._convertedFileNames = {REFERENCE: False, QUERY: False} + self._inputFileFormats = {REFERENCE: None, QUERY: None} + self._converted = {REFERENCE: False, QUERY: False} + self._ncListHandlers = {REFERENCE: None, QUERY: None} + self._splittedFileNames = {REFERENCE: {}, QUERY: {}} + self._nbOverlappingQueries = 0 + self._nbOverlaps = 0 + self._nbLines = {REFERENCE: 0, QUERY: 0} + self._sorted = False + self._index = False + self._verbosity = verbosity + self._ncLists = {} + self._cursors = {} + self._nbElementsPerChromosome = {} + self._tmpDirectories = {REFERENCE: False, QUERY: False} + + def close(self): + self._iWriter.close() + for fileName in (self._sortedFileNames.values()): + if os.path.exists(fileName): + 
os.remove(fileName) + for fileName in self._convertedFileNames.values(): + if fileName: + os.remove(fileName) + + def setRefFileName(self, fileName, format): + self.setFileName(fileName, format, REFERENCE) + + def setQueryFileName(self, fileName, format): + self.setFileName(fileName, format, QUERY) + + def setFileName(self, fileName, format, type): + self._inputFileNames[type] = fileName + self._inputFileFormats[type] = format + if format.lower() != "nclist": + self._converted[type] = True + + def setOutputFileName(self, outputFileName): + self._outputFileName = outputFileName + self._iWriter = Gff3Writer(self._outputFileName) + + def setSorted(self, sorted): + self._sorted = sorted + + def setIndex(self, index): + self._index = index + + def createNCLists(self): + startTime = time.time() + if self._verbosity > 1: + print "Building database" + self._ncLists = dict([type, {}] for type in TYPES) + self._indices = dict([type, {}] for type in TYPES) + self._cursors = dict([type, {}] for type in TYPES) + for type in TYPES: + self._ncListHandlers[type] = NCListHandler(self._verbosity-3) + if self._converted[type]: + self._convertedFileNames[type] = "%s_%d.ncl" % (os.path.splitext(self._inputFileNames[type])[0], type) + ncLists = ConvertToNCList(self._verbosity-3) + ncLists.setInputFileName(self._inputFileNames[type], self._inputFileFormats[type]) + ncLists.setSorted(self._sorted) + ncLists.setOutputFileName(self._convertedFileNames[type]) + if type == REFERENCE and self._index: + ncLists.setIndex(True) + ncLists.run() + self._ncListHandlers[type].setFileName(self._convertedFileNames[type]) + else: + self._ncListHandlers[type].setFileName(self._inputFileNames[type]) + self._ncListHandlers[type].loadData() + self._nbLines[type] = self._ncListHandlers[type].getNbElements() + self._nbElementsPerChromosome[type] = self._ncListHandlers[type].getNbElementsPerChromosome() + self._ncLists[type] = self._ncListHandlers[type].getNCLists() + for chromosome, ncList in 
self._ncLists[type].iteritems(): + self._cursors[type][chromosome] = NCListCursor(None, ncList, 0, self._verbosity) + if type == REFERENCE and self._index: + self._indices[REFERENCE][chromosome] = ncList.getIndex() + endTime = time.time() + if self._verbosity > 1: + print "done (%.2gs)" % (endTime - startTime) + + def compare(self): + nbSkips, nbMoves = 0, 0 + previousChromosome = None + done = False + startTime = time.time() + progress = Progress(len(self._ncLists[QUERY].keys()), "Checking overlap", self._verbosity) + #print "query:", self._ncLists[QUERY].keys() + #print "reference:", self._ncLists[REFERENCE].keys() + for chromosome, queryNCList in self._ncLists[QUERY].iteritems(): + queryParser = self._ncListHandlers[QUERY].getParser(chromosome) + queryCursor = self._cursors[QUERY][chromosome] + if chromosome != previousChromosome: + skipChromosome = False + previousChromosome = chromosome + if chromosome not in self._ncLists[REFERENCE]: + #print "out ", chromosome + continue + refNCList = self._ncLists[REFERENCE][chromosome] + refCursor = self._cursors[REFERENCE][chromosome] + #print "starting", chromosome + while True: + queryTranscript = queryCursor.getTranscript() + newRefLaddr = self.checkIndex(queryTranscript, refCursor) + #print "query is", queryTranscript + if newRefLaddr != None: + nbMoves += 1 + refCursor.setLIndex(newRefLaddr) + #print "skipping to", refCursor + done = False + refCursor, done, unmatched = self.findOverlapIter(queryTranscript, refCursor, done) + #print "completed with", refCursor, done, unmatched + if refCursor.isOut(): + #print "exiting 1", chromosome + break + if unmatched or not queryCursor.hasChildren(): + queryCursor.moveNext() + #print "moving next to", queryCursor + nbSkips += 1 + else: + queryCursor.moveDown() + #print "moving down to", queryCursor + if queryCursor.isOut(): + #print "exiting 2", chromosome + break + progress.inc() + progress.done() + endTime = time.time() + self._timeSpent = endTime - startTime + if 
self._verbosity >= 10: + print "# skips: %d" % (nbSkips) + print "# moves: %d" % (nbMoves) + + def findOverlapIter(self, queryTranscript, cursor, done): + chromosome = queryTranscript.getChromosome() + if chromosome not in self._ncLists[REFERENCE]: + return False, None + ncList = self._ncLists[REFERENCE][chromosome] + overlappingNames = {} + nextDone = False + firstOverlapLAddr = NCListCursor(cursor) + firstOverlapLAddr.setLIndex(-1) + if cursor.isOut(): + return firstOverlapLAddr, False + parentCursor = NCListCursor(cursor) + parentCursor.moveUp() + firstParentAfter = False + #print "query transcript 1", queryTranscript + #print "cursor 1", cursor + #print "parent 1", parentCursor + while not parentCursor.isOut(): + if self.isOverlapping(queryTranscript, parentCursor) == 0: + #print "overlap parent choice 0" + overlappingNames.update(self._extractID(parentCursor.getTranscript())) + if firstOverlapLAddr.isOut(): + #print "overlap parent 2" + firstOverlapLAddr.copy(parentCursor) + nextDone = True # new + elif self.isOverlapping(queryTranscript, parentCursor) == 1: + #print "overlap parent choice 1" + firstParentAfter = NCListCursor(parentCursor) + parentCursor.moveUp() + #print "parent 2", parentCursor + if firstParentAfter: + #print "exit parent", firstParentAfter, overlappingNames + self._writeIntervalInNewGFF3(queryTranscript, overlappingNames) + return firstParentAfter, False, not overlappingNames + #This loop finds the overlaps with currentRefLAddr.# + while True: + #print "ref cursor now is", cursor + parentCursor = NCListCursor(cursor) + parentCursor.moveUp() + #In case: Query is on the right of the RefInterval and does not overlap. + overlap = self.isOverlapping(queryTranscript, cursor) + if overlap == -1: + cursor.moveNext() + #In case: Query overlaps with RefInterval. 
def isOverlapping(self, queryTranscript, refTranscript):
    """Compare the position of a query interval with a reference interval.

    Both arguments only need getStart() and getEnd(). Bounds are inclusive.

    Returns
        0  when the two intervals share at least one position,
        1  when the query ends before the reference starts,
       -1  otherwise (the query starts after the reference ends).
    """
    queryStart = queryTranscript.getStart()
    queryEnd = queryTranscript.getEnd()
    refStart = refTranscript.getStart()
    refEnd = refTranscript.getEnd()
    # Query lies entirely on the left of the reference.
    if queryEnd < refStart:
        return 1
    # Query lies entirely on the right of the reference.
    if queryStart > refEnd:
        return -1
    return 0
if __name__ == "__main__":
    # Command-line entry point of FindOverlapsOptim: parse the options, check
    # the compulsory ones, then run the comparison.
    # The help strings are kept byte-identical on purpose.
    description = "Find Overlaps Optim v1.0.0: Finds overlaps with several query intervals. [Category: Data Comparison]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--query", dest="inputQueryFileName", action="store", type="string", help="query input file [compulsory] [format: file in transcript or other format given by -f]")
    parser.add_option("-f", "--queryFormat", dest="queryFormat", action="store", type="string", help="format of previous file (possibly in NCL format) [compulsory] [format: transcript or other file format]")
    parser.add_option("-j", "--ref", dest="inputRefFileName", action="store", type="string", help="reference input file [compulsory] [format: file in transcript or other format given by -g]")
    parser.add_option("-g", "--refFormat", dest="refFormat", action="store", type="string", help="format of previous file (possibly in NCL format) [compulsory] [format: transcript or other file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-d", "--index", dest="index", action="store_true", default=False, help="add an index to the reference file (faster but more memory) [format: boolean] [default: False]")
    parser.add_option("-s", "--sorted", dest="sorted", action="store_true", default=False, help="input files are already sorted [format: boolean] [default: False]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="Trace level [format: int] [default: 1]")
    (options, args) = parser.parse_args()

    # A missing compulsory option used to reach setFileName() as None and die
    # on format.lower(); report it as a usage error instead.
    compulsoryOptions = (
        ("-i/--query", options.inputQueryFileName),
        ("-f/--queryFormat", options.queryFormat),
        ("-j/--ref", options.inputRefFileName),
        ("-g/--refFormat", options.refFormat),
        ("-o/--output", options.outputFileName),
    )
    for optionName, optionValue in compulsoryOptions:
        if optionValue is None:
            parser.error("option %s is compulsory" % (optionName))

    iFOO = FindOverlapsOptim(options.verbosity)
    iFOO.setRefFileName(options.inputRefFileName, options.refFormat)
    iFOO.setQueryFileName(options.inputQueryFileName, options.queryFormat)
    iFOO.setOutputFileName(options.outputFileName)
    iFOO.setIndex(options.index)
    iFOO.setSorted(options.sorted)
    iFOO.run()
Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +"""Get the differential expression between 2 conditions (2 files), on regions defined by a third file""" + +import os, re +from optparse import OptionParser +from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer +from commons.core.writer.Gff3Writer import Gff3Writer +from SMART.Java.Python.misc.Progress import Progress +from SMART.Java.Python.misc.RPlotter import RPlotter +from SMART.Java.Python.misc import Utils +from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection +from SMART.Java.Python.structure.Transcript import Transcript + +class GetDifferentialExpression(object): + + def __init__(self, verbosity = 1): + self.verbosity = verbosity + self.mySqlConnection = MySqlConnection(verbosity) + self.inputs = (0, 1) + self.transcriptContainers = [None, None] + self.transcriptContainerRef = None + self.outputFileName = None + self.writer = None + self.tables = [None, None] + self.nbElements = [0, 0] + + self.regionsToValues = {} + self.regionsToNames = {} + self.valuesToPvalues = {} + + self.oriented = True + self.simpleNormalization = False + self.simpleNormalizationParameters = None + self.adjustedNormalization = False + self.fixedSizeFactor = None + self.normalizationSize = None + self.normalizationFactors = [1, 1] + self.fdr = None + self.fdrPvalue = None + + self.plot = False + self.plotter = None + self.plotterName = None + self.points = {} + + + def setInputFile(self, i, fileName, fileFormat): + self.transcriptContainers[i] = TranscriptContainer(fileName, fileFormat, self.verbosity) + 
def setSimpleNormalizationParameters(self, parameters):
    """Use user-provided read counts for the simple normalization.

    parameters -- None (nothing is changed), or a string "n1,n2" giving the
                  number of reads of sample 1 and sample 2

    Raises ValueError when the string does not hold exactly two strictly
    positive integers.
    """
    if parameters is None:
        return
    # Validate before touching any state. A single value used to leave the
    # second count at 0 (division by zero during normalization), and a third
    # value raised a bare IndexError.
    splittedParameters = parameters.split(",")
    if len(splittedParameters) != 2:
        raise ValueError("Expected two comma-separated read counts, got '%s'" % (parameters))
    counts = [int(splittedParameter) for splittedParameter in splittedParameters]
    if min(counts) <= 0:
        raise ValueError("Read counts must be strictly positive, got '%s'" % (parameters))
    self.simpleNormalization = True
    self.simpleNormalizationParameters = counts
"Reading sample %d" % (i +1), self.verbosity) + for chromosome in self.tables[i]: + for transcript in self.tables[i][chromosome].getIterator(): + self.nbElements[i] += 1 if "nbElements" not in transcript.getTagNames() else transcript.getTagValue("nbElements") + progress.inc() + progress.done() + if self.verbosity > 0: + print "%d elements in sample %d" % (self.nbElements[i], i+1) + + + def computeSimpleNormalizationFactors(self): + nbElements = self.nbElements + if self.simpleNormalizationParameters != None: + print "Using provided normalization parameters: %s" % (", ".join([str(parameter) for parameter in self.simpleNormalizationParameters])) + nbElements = self.simpleNormalizationParameters + avgNbElements = int(float(sum(nbElements)) / len(nbElements)) + for i in self.inputs: + self.normalizationFactors[i] = float(avgNbElements) / nbElements[i] + self.nbElements[i] *= self.normalizationFactors[i] + if self.verbosity > 1: + print "Normalizing to average # reads: %d" % (avgNbElements) + if self.simpleNormalizationParameters != None: + print "# reads: %s" % (", ".join([str(nbElement) for nbElement in self.nbElements])) + + def __del__(self): + self.mySqlConnection.deleteDatabase() + + def regionToString(self, transcript): + return "%s:%d-%d(%s)" % (transcript.getChromosome(), transcript.getStart(), transcript.getEnd(), "+" if transcript.getDirection() == 1 else "-") + + def stringToRegion(self, region): + m = re.search(r"^(\S+):(\d+)-(\d+)\((\S)\)$", region) + if m == None: + raise Exception("Internal format error: cannot parse region '%s'" % (region)) + transcript = Transcript() + transcript.setChromosome(m.group(1)) + transcript.setStart(int(m.group(2))) + transcript.setEnd(int(m.group(3))) + transcript.setDirection(m.group(4)) + return transcript + + def computeMinimumSize(self): + self.normalizationSize = 1000000000 + progress = Progress(self.transcriptContainerRef.getNbTranscripts(), "Getting minimum reference size", self.verbosity) + for transcriptRef in 
def useFixedSizeNormalization(self, start, end, starts):
    """Sliding-window count of the reads of one region, scaled to a fixed size.

    start, end -- inclusive bounds of the region
    starts     -- dict {read start position: number of reads}; it is filled
                  with zeroes and modified in place

    Returns 0 when `starts` is empty. Otherwise returns the sum of the window
    counts divided by the window size, multiplied by
    self.fixedSizeFactor / (region length).
    """
    if not starts:
        return 0
    # self.normalizationSize can be 0, which made the final division fail;
    # fall back to a 1 nt window in that case.
    windowSize = max(1, self.normalizationSize)
    for position in range(start - windowSize, end + 1 + windowSize):
        if position not in starts:
            starts[position] = 0
    # Fold everything found before the region onto its first position.
    # Iterate over a snapshot because the loop rewrites the dict it walks.
    for position, nbReads in list(starts.items()):
        if position < start:
            starts[start] += nbReads
            starts[position] = 0
    currentNb = 0
    total = 0
    for position in range(start - windowSize, end + 1):
        currentNb += starts[position + windowSize] - starts[position]
        total += currentNb
    # float(): with an integer factor, Python 2 truncated factor / length,
    # giving 0 for every region longer than the factor.
    return (float(total) / windowSize) * (float(self.fixedSizeFactor) / (end - start + 1))
enumerate(self.transcriptContainerRef.getIterator()): + if "ID" in transcriptRef.getTagNames(): + self.regionsToNames[self.regionToString(transcriptRef)] = transcriptRef.getTagValue("ID") + elif transcriptRef.getName() != None: + self.regionsToNames[self.regionToString(transcriptRef)] = transcriptRef.getName() + else: + self.regionsToNames[self.regionToString(transcriptRef)] = "region_%d" % (cpt) + values = [None, None] + normalizedValues = [None, None] + for i in self.inputs: + values[i], normalizedValues[i] = self.retrieveCounts(transcriptRef, i) + normalizedValues[i] = int(self.normalizationFactors[i] * normalizedValues[i]) + if sum(values) != 0: + self.regionsToValues[self.regionToString(transcriptRef)] = (normalizedValues[0], normalizedValues[1], values[0], values[1]) + progress.inc() + progress.done() + + def computeAdjustedNormalizationFactors(self): + nbElements = len(self.regionsToValues.keys()) + avgValues = [] + progress = Progress(nbElements, "Normalization step 1", self.verbosity) + for values in self.regionsToValues.values(): + correctedValues = [values[i] * self.normalizationFactors[i] for i in self.inputs] + avgValues.append(float(sum(correctedValues)) / len(correctedValues)) + progress.inc() + progress.done() + + sortedAvgValues = sorted(avgValues) + minAvgValues = sortedAvgValues[nbElements / 4] + maxAvgValues = sortedAvgValues[nbElements * 3 / 4] + sums = [0, 0] + progress = Progress(nbElements, "Normalization step 2", self.verbosity) + for values in self.regionsToValues.values(): + correctedValues = [values[i] * self.normalizationFactors[i] for i in self.inputs] + avgValue = float(sum(correctedValues)) / len(correctedValues) + if minAvgValues <= avgValue and avgValue <= maxAvgValues: + for i in self.inputs: + sums[i] += values[i] + progress.inc() + progress.done() + + avgSums = float(sum(sums)) / len(sums) + for i in self.inputs: + if self.verbosity > 1: + print "Normalizing sample %d: %s to" % ((i+1), self.nbElements[i]), + 
self.normalizationFactors[i] *= float(avgSums) / sums[i] + self.nbElements[i] *= self.normalizationFactors[i] + if self.verbosity > 1: + print "%s" % (int(self.nbElements[i])) + + def getMinimumReferenceSize(self): + self.normalizationSize = 1000000000 + progress = Progress(self.transcriptContainerRef.getNbTranscripts(), "Reference element sizes", self.verbosity) + for transcriptRef in self.transcriptContainerRef.getIterator(): + self.normalizationSize = min(self.normalizationSize, transcriptRef.getEnd() - transcriptRef.getStart() + 1) + progress.inc() + progress.done() + if self.verbosity > 1: + print "Minimum reference size: %d" % (self.normalizationSize) + + def computePvalues(self): + normalizedValues = set() + progress = Progress(len(self.regionsToValues.keys()), "Normalizing counts", self.verbosity) + for region in self.regionsToValues: + values = self.regionsToValues[region] + normalizedValues0 = int(round(values[0] * self.normalizationFactors[0])) + normalizedValues1 = int(round(values[1] * self.normalizationFactors[1])) + self.regionsToValues[region] = (normalizedValues0, normalizedValues1, self.regionsToValues[region][2], self.regionsToValues[region][3]) + normalizedValues.add((normalizedValues0, normalizedValues1, self.nbElements[0] - normalizedValues0, self.nbElements[1] - normalizedValues1, self.regionsToValues[region][2], self.regionsToValues[region][3])) + progress.inc() + progress.done() + + if self.verbosity > 1: + print "Computing p-values..." + self.valuesToPvalues = Utils.fisherExactPValueBulk(list(normalizedValues)) + if self.verbosity > 1: + print "... 
def computeFdr(self):
    """Find the Benjamini-Hochberg p-value cut-off for the FDR self.fdr.

    Collects the p-value of every region in self.regionsToValues (looked up
    in self.valuesToPvalues with the first two values of the region) and
    stores in self.fdrPvalue the largest p-value p(k) such that
    p(k) <= self.fdr * k / m, where k is the 1-based rank of p(k) in
    increasing order and m the number of regions. self.fdrPvalue is left
    untouched when no p-value passes.
    """
    nbRegions = len(self.regionsToValues)
    progress = Progress(nbRegions, "Computing FDR", self.verbosity)
    pValues = []
    for values in self.regionsToValues.values():
        pValues.append(self.valuesToPvalues[values[0:2]])
        progress.inc()
    progress.done()

    # Walk from the largest p-value down: the first one under its threshold
    # is the step-up cut-off.
    for i, pValue in enumerate(sorted(pValues, reverse=True)):
        # Rank of this p-value in increasing order, 1-based. The previous
        # code used rank - 1, so the smallest p-value was compared with 0.
        rank = nbRegions - i
        if pValue <= float(self.fdr) * rank / nbRegions:
            self.fdrPvalue = pValue
            if self.verbosity > 1:
                print("FDR: %f, k: %i, m: %d" % (pValue, rank, nbRegions))
            return
if __name__ == "__main__":

    # Command-line entry point of GetDifferentialExpression: parse the
    # options, check the compulsory ones, configure the comparison, run it.
    description = "Get Differential Expression v1.0.1: Get the differential expression between 2 conditions using Fisher's exact test, on regions defined by a third file. [Category: Data Comparison]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of file 2 [compulsory] [format: transcript file format]")
    parser.add_option("-k", "--reference", dest="referenceFileName", action="store", type="string", help="reference file [compulsory] [format: file in transcript format given by -l]")
    parser.add_option("-l", "--referenceFormat", dest="referenceFormat", action="store", type="string", help="format of reference file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in gff3 format]")
    parser.add_option("-n", "--notOriented", dest="notOriented", action="store_true", default=False, help="if the reads are not oriented [default: False] [format: bool]")
    parser.add_option("-s", "--simple", dest="simple", action="store_true", default=False, help="normalize using the number of reads in each condition [format: bool]")
    # This option takes a "n1,n2" string; its help used to advertise a bool.
    parser.add_option("-S", "--simpleParameters", dest="simpleParameters", action="store", default=None, type="string", help="provide the number of reads [format: string]")
    parser.add_option("-a", "--adjusted", dest="adjusted", action="store_true", default=False, help="normalize using the number of reads of 'mean' regions [format: bool]")
    parser.add_option("-x", "--fixedSizeFactor", dest="fixedSizeFactor", action="store", default=None, type="int", help="give the magnification factor for the normalization using fixed size sliding windows in reference regions (leave empty for no such normalization) [format: int]")
    parser.add_option("-d", "--fdr", dest="fdr", action="store", default=None, type="float", help="use FDR [format: float]")
    parser.add_option("-p", "--plot", dest="plotName", action="store", default=None, type="string", help="plot cloud plot [format: output file in PNG format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # Report a missing compulsory option as a usage error before any object
    # is built, instead of passing None down to the parsers.
    compulsoryOptions = (
        ("-i/--input1", options.inputFileName1),
        ("-f/--format1", options.format1),
        ("-j/--input2", options.inputFileName2),
        ("-g/--format2", options.format2),
        ("-k/--reference", options.referenceFileName),
        ("-l/--referenceFormat", options.referenceFormat),
    )
    for optionName, optionValue in compulsoryOptions:
        if optionValue is None:
            parser.error("option %s is compulsory" % (optionName))

    differentialExpression = GetDifferentialExpression(options.verbosity)
    differentialExpression.setInputFile(0, options.inputFileName1, options.format1)
    differentialExpression.setInputFile(1, options.inputFileName2, options.format2)
    differentialExpression.setReferenceFile(options.referenceFileName, options.referenceFormat)
    differentialExpression.setOutputFile(options.outputFileName)
    if options.plotName != None :
        differentialExpression.setPlotterName(options.plotName)
        differentialExpression.setPlotter()
    differentialExpression.setOriented(not options.notOriented)
    differentialExpression.setSimpleNormalization(options.simple)
    differentialExpression.setSimpleNormalizationParameters(options.simpleParameters)
    differentialExpression.setAdjustedNormalization(options.adjusted)
    differentialExpression.setFixedSizeNormalization(options.fixedSizeFactor)
    differentialExpression.setFdr(options.fdr)
    differentialExpression.getDifferentialExpression()
    differentialExpression.mySqlConnection.deleteDatabase()
def setRegion(self, chromosome, start, end):
    """Restrict the distribution to one region of one chromosome.

    chromosome -- name of the chromosome; None leaves the object untouched
    start, end -- bounds of the region; `end` is also stored as the size of
                  the chromosome and as the maximum size
    """
    if chromosome is None:
        return
    # Use the argument. This line used to read the script-level `options.end`,
    # which does not exist when the class is used from another module.
    self.maxSize = end
    self.sizes = {chromosome: end}
    self.chromosomes = [chromosome]
    self.chromosome = chromosome
    self.start = start
    self.end = end
def _normalize(self):
    """Rescale every bin so that all tags end up with the same total.

    self.nbValues maps a tag name to the sum of its values; each bin of a tag
    is multiplied by (average of the totals) / (total of that tag). Nothing
    is done when no value has been counted.
    """
    if not self.nbValues:
        return
    # Average the totals. This used to sum the dict itself, i.e. its keys
    # (tag names), which raises a TypeError.
    average = float(sum(self.nbValues.values())) / len(self.nbValues)
    factors = {}
    for name, total in self.nbValues.items():
        # A tag whose values sum to 0 cannot be rescaled; leave it unchanged
        # instead of dividing by zero.
        factors[name] = average / total if total else 1.0
    for chromosome in self.bins:
        for name in self.bins[chromosome]:
            for strand in self.bins[chromosome][name]:
                strandBins = self.bins[chromosome][name][strand]
                for bin in strandBins:
                    strandBins[bin] *= factors[name]
(os.path.splitext(self.outputFileName)[0], chromosome), self.verbosity) + plot.setImageSize(self.width, self.height) + if self.sizes[chromosome] <= 1000: + unit = "nt." + ratio = 1.0 + elif self.sizes[chromosome] <= 1000000: + unit = "kb" + ratio = 1000.0 + else: + unit = "Mb" + ratio = 1000000.0 + if self.yMin != None: + plot.setMinimumY(self.yMin) + if self.yMax != None: + plot.setMaximumY(self.yMax) + plot.setXLabel("Position on %s (in %s)" % (chromosome.replace("_", " "), unit)) + plot.setLegend(True) + for i, name in enumerate(self.bins[chromosome]): + for strand in self.bins[chromosome][name]: + fullName = "%s %s" % (name.replace("_", " ")[:6], STRANDTOSTR[strand]) + factor = 1 if strand == 0 else strand + correctedLine = dict([(key / ratio, value * factor) for key, value in self.bins[chromosome][name][strand].iteritems()]) + plot.addLine(correctedLine, fullName, self.colors[i] if self.colors else None) + return plot + + def _plot(self): + if self.merge: + multiplePlot = MultipleRPlotter(self.outputFileName, self.verbosity) + multiplePlot.setImageSize(self.width, self.height * len(self.bins.keys())) + progress = Progress(len(self.bins.keys()), "Plotting", options.verbosity) + for chromosome in sorted(self.bins.keys()): + plot = self._getPlotter(chromosome) + if self.merge: + multiplePlot.addPlot(plot) + else: + plot.plot() + progress.inc() + if self.merge: + multiplePlot.plot() + progress.done() + + def _writeCsv(self): + if self.verbosity > 1: + print "Writing CSV file..." 
+ csvHandle = open(self.csvFileName, "w") + csvHandle.write("chromosome;tag;strand") + if self.nbBins != 0: + xValues = range(self.start / self.sliceSize, max(self.sizes.values()) / self.sliceSize + 1) + for value in xValues: + csvHandle.write(";%d-%d" % (value * self.sliceSize + 1, (value+1) * self.sliceSize)) + csvHandle.write("\n") + else: + xValues = [] + for chromosome in self.bins: + for name in self.bins[chromosome]: + for strand in self.bins[chromosome][name]: + for bin in self.bins[chromosome][name][strand]: + xValues.extend(self.bins[chromosome][name][strand].keys()) + xValues = sorted(list(set(xValues))) + for value in xValues: + csvHandle.write(";%d" % (value)) + csvHandle.write("\n") + for chromosome in self.bins: + csvHandle.write("%s" % (chromosome)) + for name in self.bins[chromosome]: + csvHandle.write(";%s" % (name)) + for strand in self.bins[chromosome][name]: + csvHandle.write(";%s" % (STRANDTOSTR[strand])) + for bin in xValues: + csvHandle.write(";%.2f" % (self.bins[chromosome][name][strand].get(bin, 0))) + csvHandle.write("\n") + csvHandle.write(";") + csvHandle.write(";") + csvHandle.close() + if self.verbosity > 1: + print "...done" + + def _writeGff(self): + if self.verbosity > 1: + print "Writing GFF file..." 
+ writer = Gff3Writer(self.gffFileName, self.verbosity) + cpt = 1 + for chromosome in self.bins: + for name in self.bins[chromosome]: + for strand in self.bins[chromosome][name]: + for bin in self.bins[chromosome][name][strand]: + transcript = Transcript() + transcript.setChromosome(chromosome) + transcript.setStart(bin) + if self.nbBins > 0: + transcript.setEnd(bin + self.sliceSize) + else: + transcript.setEnd(start) + transcript.setDirection(1 if strand == 0 else strand) + transcript.setTagValue("ID", "region%d" % (cpt)) + cpt += 1 + writer.write() + if self.verbosity > 1: + print "...done" + + def run(self): + if self.sizes == None: + self._estimateSizes() + self._computeSliceSize() + self._initBins() + self._populateBins() + if self.normalization: + self._normalize() + if self.average: + self._computeAverage() + self._plot() + if self.csvFileName != None: + self._writeCsv() + if self.gffFileName != None: + self._writeGff() + + +if __name__ == "__main__": + + description = "Get Distribution v1.0.2: Get the distribution of the genomic coordinates on a genome. 
[Category: Visualization]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]") + parser.add_option("-r", "--reference", dest="referenceFileName", action="store", default=None, type="string", help="file containing the genome [format: file in FASTA format]") + parser.add_option("-b", "--nbBins", dest="nbBins", action="store", default=1000, type="int", help="number of bins [default: 1000] [format: int]") + parser.add_option("-2", "--bothStrands", dest="bothStrands", action="store_true", default=False, help="plot one curve per strand [format: bool] [default: false]") + parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="plot only a chromosome [format: string]") + parser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="start from a given region [format: int]") + parser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="end from a given region [format: int]") + parser.add_option("-y", "--yMin", dest="yMin", action="store", default=None, type="int", help="minimum value on the y-axis to plot [format: int]") + parser.add_option("-Y", "--yMax", dest="yMax", action="store", default=None, type="int", help="maximum value on the y-axis to plot [format: int]") + parser.add_option("-x", "--csv", dest="csv", action="store", default=None, help="write a .csv file [format: output file in CSV format] [default: None]") + parser.add_option("-g", "--gff", dest="gff", 
action="store", default=None, help="also write GFF3 file [format: output file in GFF format] [default: None]") + parser.add_option("-H", "--height", dest="height", action="store", default=300, type="int", help="height of the graphics [format: int] [default: 300]") + parser.add_option("-W", "--width", dest="width", action="store", default=600, type="int", help="width of the graphics [format: int] [default: 1000]") + parser.add_option("-a", "--average", dest="average", action="store_true", default=False, help="plot average (instead of sum) [default: false] [format: boolean]") + parser.add_option("-n", "--names", dest="names", action="store", default="nbElements", type="string", help="name for the tags (separated by commas and no space) [default: None] [format: string]") + parser.add_option("-l", "--color", dest="colors", action="store", default=None, type="string", help="color of the lines (separated by commas and no space) [format: string]") + parser.add_option("-z", "--normalize", dest="normalize", action="store_true", default=False, help="normalize data (when panels are different) [format: bool] [default: false]") + parser.add_option("-m", "--merge", dest="mergePlots", action="store_true", default=False, help="merge all plots in one figure [format: bool] [default: false]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]") + (options, args) = parser.parse_args() + + gt = GetDistribution(options.verbosity) + gt.setInputFile(options.inputFileName, options.format) + gt.setOutputFile(options.outputFileName) + gt.setReferenceFile(options.referenceFileName) + gt.setNbBins(int(options.nbBins)) + gt.set2Strands(options.bothStrands) + gt.setRegion(options.chromosome, options.start, options.end) + gt.setNormalization(options.normalize) + gt.setAverage(options.average) + gt.setYLimits(options.yMin, options.yMax) + gt.writeCsv(options.csv) + gt.writeGff(options.gff) + 
gt.setImageSize(options.height, options.width) + gt.setNames(options.names.split(",")) + gt.setColors(None if options.colors == None else options.colors.split(",")) + gt.setNormalization(options.normalize) + gt.mergePlots(options.mergePlots) + gt.run() + diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/GetFlanking.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/GetFlanking.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,231 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2011 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. 
QUERY = 0
REFERENCE = 1
INPUTS = (QUERY, REFERENCE)
STRANDS = (-1, 1)
TAG_DISTANCE = "distance_"
TAG_SENSE = "_sense"
TAG_REGION = "_region"
TAGS_REGION = {-1: "_upstream", 0: "", 1: "_downstream"}
TAGS_RREGION = {-1: "upstream", 0: "overlapping", 1: "downstream"}
TAGS_SENSE = {-1: "antisense", 0: "", 1: "colinear"}
STRANDSTOSTR = {-1: "(-)", 0: "", 1: "(+)"}


def getOrderKey(transcript, direction):
    """Sort key: the end for direction 1, the negated start otherwise."""
    if direction == 1:
        return transcript.getEnd()
    return - transcript.getStart()

def isInGoodRegion(transcriptRef, transcriptQuery, direction):
    """Tell whether the query lies past the reference in the scan direction.

    Direction 1 compares the ends (query end beyond reference end); any other
    direction compares the starts (query start before reference start).
    """
    if direction == 1:
        return transcriptQuery.getEnd() > transcriptRef.getEnd()
    return transcriptQuery.getStart() < transcriptRef.getStart()


class GetFlanking(object):
    """Find, for each reference element, a query element flanking it on each side."""

    def __init__(self, verbosity):
        self.verbosity = verbosity
        # transcripts[QUERY|REFERENCE][chromosome] -> list of transcripts
        self.transcripts = dict((inputId, {}) for inputId in INPUTS)
        self.directions = []
        self.noOverlap = False
        self.colinear = False
        self.antisense = False
        self.distance = None
        self.minDistance = None
        self.maxDistance = None
        self.tagName = "flanking"
        # flankings[reference][direction] -> query. Created here so that
        # getFlanking() and write() also work when called without run().
        self.flankings = {}

    def setInputFile(self, fileName, format, id):
        """Load every transcript of `fileName` into the slot `id` (QUERY or REFERENCE)."""
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        parser = chooser.getParser(fileName)
        for transcript in parser.getIterator():
            self.transcripts[id].setdefault(transcript.getChromosome(), []).append(transcript)

    def setOutputFile(self, fileName):
        """Open the GFF3 writer for the results."""
        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)

    def addUpstreamDirection(self, upstream):
        """Report upstream elements when `upstream` is true."""
        if upstream:
            self.directions.append(-1)

    def addDownstreamDirection(self, downstream):
        """Report downstream elements when `downstream` is true."""
        if downstream:
            self.directions.append(1)

    def setColinear(self, colinear):
        """Only accept queries on the same strand as the reference."""
        self.colinear = colinear

    def setAntisense(self, antisense):
        """Only accept queries on the opposite strand to the reference."""
        self.antisense = antisense

    def setNoOverlap(self, noOverlap):
        """Reject queries overlapping the reference."""
        self.noOverlap = noOverlap

    def setMinDistance(self, distance):
        """Set the minimum accepted distance (None: no lower limit)."""
        self.minDistance = distance

    def setMaxDistance(self, distance):
        """Set the maximum accepted distance (None: no upper limit)."""
        self.maxDistance = distance

    def setNewTagName(self, tagName):
        """Set the base name of the tags added to the output elements."""
        self.tagName = tagName

    def match(self, transcriptRef, transcriptQuery, direction):
        """Tell whether the query passes every configured filter.

        `direction` is accepted for interface compatibility and is not used.
        """
        if self.noOverlap and transcriptRef.overlapWith(transcriptQuery):
            return False
        sameStrand = transcriptRef.getDirection() == transcriptQuery.getDirection()
        if self.colinear and not sameStrand:
            return False
        if self.antisense and sameStrand:
            return False
        if self.minDistance is not None or self.maxDistance is not None:
            distance = transcriptRef.getDistance(transcriptQuery)
            if self.minDistance is not None and distance < self.minDistance:
                return False
            if self.maxDistance is not None and distance > self.maxDistance:
                return False
        return True

    def getFlanking(self, direction):
        """Scan every shared chromosome once and record, per reference, the first matching query."""
        for chromosome in sorted(self.transcripts[REFERENCE].keys()):
            if chromosome not in self.transcripts[QUERY]:
                continue
            sortedTranscripts = {}
            for inputId in INPUTS:
                sortedTranscripts[inputId] = sorted(self.transcripts[inputId][chromosome], key = lambda t: getOrderKey(t, direction))
            references = sortedTranscripts[REFERENCE]
            refIndex = 0
            # References already passed by the scan and still waiting for a match.
            currentRefs = []
            progress = Progress(len(sortedTranscripts[QUERY]), "Reading chr %s %s" % (chromosome, STRANDSTOSTR[direction]), self.verbosity)
            for query in sortedTranscripts[QUERY]:
                while refIndex < len(references) and isInGoodRegion(references[refIndex], query, direction):
                    currentRefs.append(references[refIndex])
                    refIndex += 1
                nextCurrentRefs = []
                for currentRef in currentRefs:
                    if self.match(currentRef, query, direction):
                        # Stored relative to the reference strand, so -1 is
                        # upstream of the reference and 1 is downstream.
                        self.flankings.setdefault(currentRef, {})[direction * currentRef.getDirection()] = query
                    else:
                        nextCurrentRefs.append(currentRef)
                currentRefs = nextCurrentRefs
                progress.inc()
            progress.done()

    def setTags(self, query, reference, direction):
        """Annotate `query` with the name, distance and tags of `reference`; return `query`."""
        refName = reference.getTagValue("ID")
        if refName is None:
            refName = reference.getName()
        if refName is None:
            refName = reference.__str__()
        suffix = TAGS_REGION[direction]
        query.setTagValue("%s%s" % (self.tagName, suffix), refName)
        query.setTagValue("%s_%s%s" % (TAG_DISTANCE, self.tagName, suffix), query.getDistance(reference))
        # NOTE(review): indentation was lost in this copy of the file; both the
        # sense and the region tag are assumed to belong to this branch - confirm.
        if direction == 0:
            query.setTagValue("%s_%s" % (TAG_SENSE, self.tagName), TAGS_SENSE[query.getDirection() * reference.getDirection()])
            relative = query.getRelativeDistance(reference)
            # Sign of the relative distance (replaces the Python-2-only cmp()).
            query.setTagValue("%s_%s" % (TAG_REGION, self.tagName), TAGS_RREGION[(relative > 0) - (relative < 0)])
        for tag in reference.getTagNames():
            if tag not in ("quality", "feature"):
                query.setTagValue("%s%s_%s" % (self.tagName, suffix, tag), reference.getTagValue(tag))
        return query

    def write(self):
        """Tag the selected queries and write them sorted by chromosome, start and end."""
        outputs = set()
        progress = Progress(len(self.flankings.keys()), "Printing data", self.verbosity)
        for transcriptRef in self.flankings.keys():
            found = self.flankings[transcriptRef]
            if self.directions:
                for direction in self.directions:
                    if direction in found:
                        outputs.add(self.setTags(found[direction], transcriptRef, direction))
            elif found:
                # No side requested: keep the closest of the recorded queries.
                query = min(found.values(), key = lambda query: query.getDistance(transcriptRef))
                outputs.add(self.setTags(query, transcriptRef, 0))
            progress.inc()
        for transcript in sorted(outputs, key = lambda flanking: (flanking.getChromosome(), flanking.getStart(), flanking.getEnd())):
            self.writer.addTranscript(transcript)
        self.writer.close()
        progress.done()

    def run(self):
        """Search both scan directions, then write the results."""
        self.flankings = {}
        for direction in STRANDS:
            self.getFlanking(direction)
        self.write()

if __name__ == "__main__":

    description = "Get Flanking v1.0.1: Get the flanking regions of a set of reference. [Category: Data Selection]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-5", "--upstream", dest="upstream", action="store_true", default=False, help="output upstream elements [format: boolean] [default: False]")
    parser.add_option("-3", "--downstream", dest="downstream", action="store_true", default=False, help="output downstream elements [format: boolean] [default: False]")
    parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="find first colinear element [format: boolean] [default: False]")
    parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="find first anti-sense element [format: boolean] [default: False]")
    parser.add_option("-e", "--noOverlap", dest="noOverlap", action="store_true", default=False, help="do not consider elements which are overlapping reference elements [format: boolean] [default: False]")
    parser.add_option("-d", "--minDistance", dest="minDistance", action="store", default=None, type="int", help="minimum distance between 2 elements [format: int]")
    parser.add_option("-D", "--maxDistance", dest="maxDistance", action="store", default=None, type="int", help="maximum distance between 2 elements [format: int]")
    parser.add_option("-t", "--tag", dest="tagName", action="store", default="flanking", type="string", help="name of the new tag [format: string] [default: flanking]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    gf = GetFlanking(options.verbosity)
    gf.setInputFile(options.inputFileName1, options.format1, QUERY)
    gf.setInputFile(options.inputFileName2, options.format2, REFERENCE)
    gf.setOutputFile(options.outputFileName)
    gf.addUpstreamDirection(options.upstream)
    gf.addDownstreamDirection(options.downstream)
    gf.setColinear(options.colinear)
    gf.setAntisense(options.antisense)
    gf.setNoOverlap(options.noOverlap)
    gf.setMinDistance(options.minDistance)
    gf.setMaxDistance(options.maxDistance)
    gf.setNewTagName(options.tagName)
    gf.run()
class GetRandomSubset(object):
    """Write a uniformly drawn random subset of the input transcripts."""

    def __init__(self, verbosity):
        self.verbosity = verbosity

    def setInputFile(self, fileName, format):
        """Open the parser matching `format` on `fileName`."""
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        self.parser = chooser.getParser(fileName)

    def setNumber(self, number, percent):
        """Set how many elements to output, as a count or a percentage of the input.

        Both values may be strings, as delivered by the command line.
        `number` wins when both are given.

        Raises:
            Exception: when neither `number` nor `percent` is given.
        """
        if number is not None:
            # The "-n" option is declared with type="string"; random.sample()
            # needs an integer, so convert here.
            self.number = int(number)
        elif percent is not None:
            self.number = int(float(percent) / 100 * self.parser.getNbTranscripts())
        else:
            raise Exception("Error! Number of elements to output is not given!")

    def setOutputFile(self, fileName):
        """Open the GFF3 writer for the subset."""
        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)

    def chooseElements(self):
        """Draw the indices of the elements to keep, without replacement.

        random.sample() raises ValueError when more elements are requested
        than the input holds.
        """
        self.randomIndices = random.sample(range(self.parser.getNbTranscripts()), self.number)

    def run(self):
        """Read the input once and write the elements whose index was drawn."""
        self.chooseElements()
        # Constant-time membership tests instead of scanning a list per element.
        wanted = set(self.randomIndices)
        progress = Progress(self.parser.getNbTranscripts(), "Reading input file", self.verbosity)
        nbWritten = 0
        for cpt1, transcript in enumerate(self.parser.getIterator()):
            if cpt1 in wanted:
                self.writer.addTranscript(transcript)
                nbWritten += 1
            progress.inc()
        self.writer.write()
        self.writer.close()
        progress.done()
        if self.verbosity > 1:
            print("%d transcripts read" % (self.parser.getNbTranscripts()))
            print("%d transcripts written" % (nbWritten))


if __name__ == "__main__":

    description = "Get Random Subset v1.0.1: Get a random sub-set of a list of genomic coordinates. [Category: Personal]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [compulsory] [format: transcript file format]")
    parser.add_option("-n", "--number", dest="number", action="store", default=None, type="string", help="number of elements to output [format: int]")
    parser.add_option("-p", "--percent", dest="percent", action="store", default=None, type="string", help="percentage of elements to output (between 0 and 100) [format: int]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    grs = GetRandomSubset(options.verbosity)
    grs.setInputFile(options.inputFileName, options.format)
    grs.setNumber(options.number, options.percent)
    grs.setOutputFile(options.outputFileName)
    grs.run()
LOG_DEPTH = "smart"
DEFAULT_REGION = "_all_"
# Axis-label suffix per divisor applied to the positions. The original labels
# read "kpb" and "Gbp"; 1000 bp is a kbp and 1000000 bp is a Mbp.
MULTIPLE_STR = {1: "", 1000: " (in kbp)", 1000000: " (in Mbp)"}

class GetReadDistribution(object):
    """Count reads per fixed-size bin and plot the counts with R/ggplot2."""

    def __init__(self, verbosity = 0):
        self.xLab = ""
        self.yLab = "# reads"
        self.verbosity = verbosity
        # Random suffix of the temporary .dat/.R file names.
        self.number = random.randint(0, 100000)
        self.log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity)
        self.parsers = {}
        # distribution[region][sample name][chromosome][bin] -> count
        self.distribution = {}
        self.factors = {}
        self.regions = None
        self.tmpDatName = None
        self.tmpRName = None
        self.quorum = 1
        self.width = 800
        self.height = 300
        # Defaults for settings the original only created in their setters,
        # so that run() does not fail on a missing attribute. 10000 is the
        # default of the --binSize command-line option.
        self.names = []
        self.colors = None
        self.multiple = False
        self.binSize = 10000

    def setNames(self, names):
        """Set the sample names; must be called before setInputFiles() and setFactors()."""
        self.names = names

    def setInputFiles(self, fileNames, format):
        """Open one parser per file; the i-th file is stored under the i-th name."""
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        for cpt, fileName in enumerate(fileNames):
            self.parsers[self.names[cpt]] = chooser.getParser(fileName)

    def setOutputFileName(self, fileName):
        """Set the name of the output PNG file."""
        self.outputFileName = fileName

    def setLabs(self, xLab, yLab):
        """Set the axis labels."""
        self.xLab = xLab
        self.yLab = yLab

    def setBinSize(self, binSize):
        """Set the bin width, in positions."""
        self.binSize = binSize

    def setColors(self, colors):
        """Set the list of bar colors, or None for the "Set1" palette."""
        self.colors = colors

    def setFactors(self, factors):
        """Set one normalization factor per sample, in the order of the names.

        None (no factor given, as passed by the command-line front end) means
        no scaling; the original crashed in zip() on None.
        """
        if factors is None:
            self.factors = {}
        else:
            self.factors = dict(zip(self.names, factors))

    def setMultiple(self, boolean):
        """Choose whether positions are displayed divided by 1000 or 1000000."""
        self.multiple = boolean

    def setImageSize(self, width, height):
        """Set the image size; a None value keeps the current dimension."""
        if width is not None:
            self.width = width
        if height is not None:
            self.height = height

    def setQuorum(self, quorum):
        """Set the minimum bin count a region needs to be plotted."""
        self.quorum = quorum

    def setRegionsFile(self, fileName):
        """Load the regions to plot from a GFF file; None keeps a single global plot."""
        if fileName is not None:
            self._loadRegions(fileName)

    def _checkOptions(self):
        """Raise when no input file was given."""
        if not self.parsers:
            # The original called self.logAndRaise, which does not exist.
            self._logAndRaise("ERROR: Missing input file names")

    def _logAndRaise(self, errorMsg):
        """Log `errorMsg` as an error, then raise it as an Exception."""
        self.log.error(errorMsg)
        raise Exception(errorMsg)

    def _loadRegions(self, fileName):
        """Index the region names as regions[chromosome][start][end] -> [names]."""
        self.regions = {}
        parser = GffParser(fileName, self.verbosity)
        for transcript in parser.getIterator():
            byStart = self.regions.setdefault(transcript.getChromosome(), {})
            byEnd = byStart.setdefault(transcript.getStart(), {})
            byEnd.setdefault(transcript.getEnd(), []).append(transcript.getName())

    def _getRegions(self, transcript):
        """Return the names of the loaded regions overlapping `transcript`.

        Returns [DEFAULT_REGION] when no regions file was loaded.
        """
        if self.regions is None:
            return [DEFAULT_REGION]
        chromosome = transcript.getChromosome()
        start = transcript.getStart()
        end = transcript.getEnd()
        if chromosome not in self.regions:
            return []
        names = []
        for loadedStart in sorted(self.regions[chromosome].keys()):
            # Starts are visited in increasing order: nothing further can overlap.
            if loadedStart > end:
                return names
            # Ends are visited in decreasing order: stop at the first one too small.
            for loadedEnd in reversed(sorted(self.regions[chromosome][loadedStart].keys())):
                if loadedEnd < start:
                    break
                names.extend(self.regions[chromosome][loadedStart][loadedEnd])
        return names

    def _parse(self, name):
        """Add the reads of sample `name` to every bin their exons touch."""
        progress = UnlimitedProgress(10000, "Reading file '%s'" % (name), self.verbosity)
        for transcript in self.parsers[name].getIterator():
            if transcript.__class__.__name__ == "Mapping":
                transcript = transcript.getTranscript()
            for region in self._getRegions(transcript):
                chromosome = transcript.getChromosome()
                # A read without the "nbElements" tag counts for 1.
                nbElements = float(transcript.getTagValue("nbElements")) if "nbElements" in transcript.getTagNames() else 1
                nbElements *= self.factors.get(name, 1)
                bins = self.distribution.setdefault(region, {}).setdefault(name, {}).setdefault(chromosome, {})
                previousBin = None
                for exon in transcript.getExons():
                    # Walk the bins an exon spans rather than each of its
                    # positions. A bin is still counted once per run of
                    # consecutive hits, so a bin shared with the end of the
                    # previous exon is not counted again.
                    for bin in range(exon.getStart() // self.binSize, exon.getEnd() // self.binSize + 1):
                        if bin != previousBin:
                            bins[bin] = bins.get(bin, 0) + nbElements
                            previousBin = bin
            progress.inc()
        progress.done()

    def _checkQuorum(self, region):
        """Tell whether the largest bin count of `region` reaches the quorum."""
        if self.quorum is None:
            return True
        counts = [count for name in self.distribution[region] for chromosome in self.distribution[region][name] for count in self.distribution[region][name][chromosome].values()]
        # A region without any bin cannot reach the quorum (the original
        # raised ValueError on max() of an empty list).
        return bool(counts) and max(counts) >= self.quorum

    def _writeData(self, region):
        """Write the counts of `region` to the temporary tab-separated file read by R."""
        self.tmpDatName = "tmpFile%d.dat" % (self.number)
        handle = open(self.tmpDatName, "w")
        try:
            handle.write("Chr\tPos\tCount\tSample\n")
            for name in self.distribution[region]:
                for chromosome in sorted(self.distribution[region][name].keys()):
                    bins = self.distribution[region][name][chromosome]
                    for pos in sorted(bins.keys()):
                        # Bin indices are converted back to positions.
                        handle.write("%s\t%d\t%d\t\"%s\"\n" % (chromosome, pos * self.binSize, bins.get(pos, 0), name))
        finally:
            handle.close()

    def _findMultiple(self, region):
        """Return the divisor (1, 1000 or 1000000) applied to the plotted positions."""
        if not self.multiple:
            return 1
        # The original took max() of the key *lists* and compared the
        # resulting list with an integer; use the largest plotted position.
        bins = [bin for name in self.distribution[region] for chromosome in self.distribution[region][name] for bin in self.distribution[region][name][chromosome]]
        if not bins:
            return 1
        maxPosition = max(bins) * self.binSize
        if maxPosition > 2000000:
            return 1000000
        elif maxPosition > 2000:
            return 1000
        return 1

    def _writeScript(self, region):
        """Write the temporary R script drawing the bar plot of `region`."""
        self.tmpRName = "tmpFile%d.R" % (self.number)
        fileName = self.outputFileName if region == DEFAULT_REGION else "%s_%s.png" % (os.path.splitext(self.outputFileName)[0], region)
        if self.colors is None:
            colors = "scale_fill_brewer(palette=\"Set1\") + scale_color_brewer(palette=\"Set1\")"
        else:
            quotedColors = ", ".join(["\"%s\"" % (color) for color in self.colors])
            colors = "scale_fill_manual(values = c(%s)) + scale_color_manual(values = c(%s))" % (quotedColors, quotedColors)
        title = "" if region == DEFAULT_REGION else " of %s" % (region)
        facet = "Sample ~ Chr" if region == DEFAULT_REGION else "Sample ~ ."
        multiple = self._findMultiple(region)
        handle = open(self.tmpRName, "w")
        try:
            handle.write("library(ggplot2)\n")
            handle.write("data <- read.table(\"%s\", header = T)\n" % (self.tmpDatName))
            handle.write("data$Sample <- factor(data$Sample, levels=c(%s))\n" % (", ".join(["\"%s\"" % (name) for name in self.names])))
            handle.write("png(\"%s\", width = %d, height = %d)\n" % (fileName, self.width, self.height))
            handle.write("ggplot(data, aes(x = Pos/%d, y = Count, fill = Sample, color = Sample)) + opts(title = \"Distribution%s\") + geom_bar(stat = \"identity\") + facet_grid(%s, space=\"free\") + xlab(\"%s%s\") + ylab(\"%s\") + %s + opts(legend.position = \"none\", panel.grid.major = theme_blank(), panel.grid.minor = theme_blank(), panel.background = theme_blank())\n" % (multiple, title, facet, self.xLab, MULTIPLE_STR[multiple], self.yLab, colors))
            handle.write("dev.off()\n")
        finally:
            # The original never closed the script before handing it to R.
            handle.close()

    def _runR(self):
        """Run the temporary script with R (or the program named by $SMARTRPATH).

        Raises:
            Exception: when R exits with a non-zero status.
        """
        rCommand = "R"
        if "SMARTRPATH" in os.environ:
            rCommand = os.environ["SMARTRPATH"]
        command = "\"%s\" CMD BATCH %s" % (rCommand, self.tmpRName)
        status = subprocess.call(command, shell=True)
        if status != 0:
            raise Exception("Problem with the execution of script file %s, status is: %s" % (self.tmpRName, status))

    def _plot(self):
        """Plot every region that reaches the quorum."""
        progress = Progress(len(self.distribution), "Plotting data", self.verbosity)
        for region in self.distribution:
            if not self._checkQuorum(region):
                self.log.info("Not displaying '%s' for it contains insufficient data." % (region))
            else:
                self._writeData(region)
                self._writeScript(region)
                self._runR()
            progress.inc()
        progress.done()

    def _cleanFiles(self):
        """Remove the temporary files and any file sharing their name as a prefix."""
        for fileName in (self.tmpDatName, self.tmpRName):
            # Never created (nothing was plotted): the original still globbed
            # "None*" here and deleted unrelated files named that way.
            if fileName is None:
                continue
            if os.path.exists(fileName):
                os.remove(fileName)
            for otherFileName in glob.glob("%s*" % (fileName)):
                os.remove(otherFileName)

    def run(self):
        """Parse every sample, plot the distributions and clean up."""
        LoggerFactory.setLevel(self.log, self.verbosity)
        self._checkOptions()
        self.log.info("START Get Read Distribution")
        for name in self.names:
            self._parse(name)
        self._plot()
        self._cleanFiles()
        self.log.info("END Get Read Distribution")
label name [format: string] [default: Reads]") + parser.add_option("-c", "--colors", dest="colors", action="store", default=None, type="string", help="colors of the bars, separated by commas [format: string]") + parser.add_option("-a", "--factors", dest="factors", action="store", default=None, type="string", help="normalization factors, separated by commas [format: string]") + parser.add_option("-r", "--regions", dest="regionsFileName", action="store", default=None, type="string", help="regions to plot [format: transcript file in GFF format]") + parser.add_option("-m", "--multiple", dest="multiple", action="store_true", default=False, help="print position using multiples (k, G) [format: boolean] [default: False]") + parser.add_option("-q", "--quorum", dest="quorum", action="store", default=1, type="int", help="minimum number of intervals to plot a region [format: int] [default: 1]") + parser.add_option("-z", "--width", dest="width", action="store", default=800, type="int", help="width of the image [format: int] [default: 800]") + parser.add_option("-Z", "--height", dest="height", action="store", default=300, type="int", help="height of the image [format: int] [default: 300]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + options = parser.parse_args()[0] + iGetReadDistribution = GetReadDistribution(options.verbosity) + iGetReadDistribution.setNames(options.names.split(",")) + iGetReadDistribution.setInputFiles(options.inputFileNames.split(","), options.format) + iGetReadDistribution.setOutputFileName(options.outputFileName) + iGetReadDistribution.setLabs(options.xLab, options.yLab) + iGetReadDistribution.setBinSize(options.binSize) + iGetReadDistribution.setColors(None if options.colors == None else options.colors.split(",")) + iGetReadDistribution.setFactors(None if options.factors == None else map(float, options.factors.split(","))) + 
iGetReadDistribution.setRegionsFile(options.regionsFileName) + iGetReadDistribution.setMultiple(options.multiple) + iGetReadDistribution.setQuorum(options.quorum) + iGetReadDistribution.setImageSize(options.width, options.height) + iGetReadDistribution.run() + diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/GetReadSizes.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/GetReadSizes.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,255 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. 
+# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import random, os, glob, subprocess +from commons.core.parsing.ParserChooser import ParserChooser +from commons.core.parsing.GffParser import GffParser +from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress +from SMART.Java.Python.misc.Progress import Progress +from SMART.Java.Python.misc import Utils +from commons.core.LoggerFactory import LoggerFactory +from commons.core.utils.RepetOptionParser import RepetOptionParser + +LOG_DEPTH = "smart" +DEFAULT_REGION = "_all_" + +class GetReadSizes(object): + + def __init__(self, verbosity = 0): + self.xLab = "Size" + self.yLab = "# reads" + self.verbosity = verbosity + self.number = random.randint(0, 100000) + self.log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity) + self.parsers = {} + self.sizes = {} + self.factors = {} + self.regions = None + self.tmpDatName = None + self.tmpRName = None + self.width = 800 + self.height = 300 + + def setNames(self, names): + self.names = names + + def setInputFiles(self, fileNames, format): + chooser = ParserChooser(self.verbosity) + chooser.findFormat(format) + for cpt, fileName in enumerate(fileNames): + self.parsers[self.names[cpt]] = chooser.getParser(fileName) + + def setOutputFileName(self, fileName): + self.outputFileName = fileName + + def setLabs(self, xLab, yLab): + self.xLab = xLab + self.yLab = yLab + + def setSizes(self, minSize, maxSize): + self.minSize = minSize + self.maxSize = maxSize + + def setColors(self, colors): + self.colors = colors + + def setFactors(self, factors): + self.factors = dict(zip(self.names, factors)) + + def setRegionsFile(self, fileName): + if fileName != None: + self._loadRegions(fileName) + + def setImageSize(self, width, height): + if width != None: + self.width = width + if height != None: + self.height = height + + def 
_checkOptions(self): + if not self.parsers: + self.logAndRaise("ERROR: Missing input file names") + + def _logAndRaise(self, errorMsg): + self.log.error(errorMsg) + raise Exception(errorMsg) + + def _loadRegions(self, fileName): + self.regions = {} + parser = GffParser(fileName, self.verbosity) + for transcript in parser.getIterator(): + chromosome = transcript.getChromosome() + start = transcript.getStart() + end = transcript.getEnd() + name = transcript.getName() + if chromosome not in self.regions: + self.regions[chromosome] = {} + if start not in self.regions[chromosome]: + self.regions[chromosome][start] = {} + if end not in self.regions[chromosome][start]: + self.regions[chromosome][start][end] = [] + self.regions[chromosome][start][end].append(name) + + def _getRegions(self, transcript): + if self.regions == None: + return [DEFAULT_REGION] + chromosome = transcript.getChromosome() + start = transcript.getStart() + end = transcript.getEnd() + if chromosome not in self.regions: + return [] + names = [] + for loadedStart in sorted(self.regions[chromosome].keys()): + if loadedStart > end: + return names + for loadedEnd in reversed(sorted(self.regions[chromosome][loadedStart].keys())): + if loadedEnd < start: + break + names.extend(self.regions[chromosome][loadedStart][loadedEnd]) + return names + + def _parse(self, name): + progress = UnlimitedProgress(10000, "Reading file '%s'" % (name), self.verbosity) + for transcript in self.parsers[name].getIterator(): + if transcript.__class__.__name__ == "Mapping": + transcript = transcript.getTranscript() + regions = self._getRegions(transcript) + for region in regions: + if region not in self.sizes: + self.sizes[region] = {} + if name not in self.sizes[region]: + self.sizes[region][name] = {} + size = transcript.getSize() + if (self.minSize == None or size >= self.minSize) and (self.maxSize == None or size <= self.maxSize): + nbElements = float(transcript.getTagValue("nbElements")) if "nbElements" in 
transcript.getTagNames() else 1 + nbElements *= self.factors.get(name, 1) + self.sizes[region][name][size] = self.sizes[region][name].get(size, 0) + nbElements + progress.inc() + progress.done() + if self.minSize == None: + self.minSize = min([min(self.sizes[region][name].keys()) for name in self.names for region in region]) + if self.maxSize == None: + self.maxSize = max([max(self.sizes[region][name].keys()) for name in self.names for region in region]) + + def _checkQuorum(self, region): + return (max([sum(self.sizes[region][name].values()) for name in self.sizes[region]]) > 0) + + def _writeData(self, region): + self.tmpDatName = "tmpFile%d.dat" % (self.number) + handle = open(self.tmpDatName, "w") + handle.write("Size\tCount\tSample\n") + for name in self.sizes[region]: + for size in sorted(self.sizes[region][name].keys()): + handle.write("%d\t%d\t\"%s\"\n" % (size, self.sizes[region][name].get(size, 0), name)) + handle.close() + + def _writeScript(self, region): + self.tmpRName = "tmpFile%d.R" % (self.number) + fileName = self.outputFileName if region == DEFAULT_REGION else "%s_%s.png" % (os.path.splitext(self.outputFileName)[0], region) + colors = "scale_fill_brewer(palette=\"Set1\")" if self.colors == None else "scale_fill_manual(values = c(%s))" % (", ".join(["\"%s\"" % (color) for color in self.colors])) + title = "" if region == DEFAULT_REGION else " of %s" % (region) + handle = open(self.tmpRName, "w") + handle.write("library(ggplot2)\n") + handle.write("data <- read.table(\"%s\", header = T)\n" % (self.tmpDatName)) + handle.write("data$Sample <- factor(data$Sample, levels=c(%s))\n" % (", ".join(["\"%s\"" % (name) for name in self.names]))) + handle.write("data$Size <- factor(data$Size, levels=c(%s))\n" % (", ".join(["%d" % (size) for size in range(self.minSize, self.maxSize+1)]))) + handle.write("png(\"%s\", width = %d, height = %d)\n" % (fileName, self.width, self.height)) + handle.write("ggplot(data, aes(x = Size, y = Count, fill = Size)) + opts(title 
= \"Size distribution%s\") + geom_bar(stat = \"identity\") + facet_grid(. ~ Sample, space=\"free_x\") + xlab(\"%s\") + ylab(\"%s\") + %s + opts(legend.position = \"none\", panel.grid.major = theme_blank(), panel.grid.minor = theme_blank(), panel.background = theme_blank())\n" % (title, self.xLab, self.yLab, colors)) + handle.write("dev.off()\n") + + def _runR(self): + rCommand = "R" + if "SMARTRPATH" in os.environ: + rCommand = os.environ["SMARTRPATH"] + command = "\"%s\" CMD BATCH %s" % (rCommand, self.tmpRName) + status = subprocess.call(command, shell=True) + if status != 0: + raise Exception("Problem with the execution of script file %s, status is: %s" % (self.tmpRName, status)) + + def _plot(self): + progress = Progress(len(self.sizes), "Plotting data", self.verbosity) + for region in self.sizes: + if not self._checkQuorum(region): + self.log.info("Not displaying '%s' for it contains no data." % (region)) + else: + self._writeData(region) + self._writeScript(region) + self._runR() + progress.inc() + progress.done() + + def _cleanFiles(self): + for fileName in (self.tmpDatName, self.tmpRName): + if fileName != None and os.path.exists(fileName): + os.remove(fileName) + for otherFileName in glob.glob("%s*" % (fileName)): + os.remove(otherFileName) + + def run(self): + LoggerFactory.setLevel(self.log, self.verbosity) + self._checkOptions() + self.log.info("START Get Read Sizes") + for name in self.names: + self._parse(name) + self._plot() + self._cleanFiles() + self.log.info("END Get Read Sizes") + + +if __name__ == "__main__": + description = "Usage: GetReadSizes.py [options]\n\nGet Read Sizes v1.0.1: Get the sizes of a set of reads. 
[Category: Personal]\n" + epilog = "" + parser = RepetOptionParser(description = description, epilog = epilog) + parser.add_option("-i", "--input", dest="inputFileNames", action="store", default=None, type="string", help="input files, separated by commas [compulsory] [format: string]") + parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the input [compulsory] [format: transcript or sequence file format]") + parser.add_option("-n", "--names", dest="names", action="store", default=None, type="string", help="name of the input data, separated by commas [compulsory] [format: string]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [format: output file in PNG format]") + parser.add_option("-s", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size [format: int]") + parser.add_option("-S", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size [format: int]") + parser.add_option("-l", "--xLabel", dest="xLab", action="store", default="Size", type="string", help="x-axis label name [format: string] [default: Size]") + parser.add_option("-L", "--yLabel", dest="yLab", action="store", default="# reads", type="string", help="y-axis label name [format: string] [default: Reads]") + parser.add_option("-c", "--colors", dest="colors", action="store", default=None, type="string", help="colors of the bars, separated by commas [format: string]") + parser.add_option("-a", "--factors", dest="factors", action="store", default=None, type="string", help="normalization factors, separated by commas [format: string]") + parser.add_option("-r", "--regions", dest="regionsFileName", action="store", default=None, type="string", help="regions to plot [format: transcript file in GFF format]") + parser.add_option("-z", "--width", dest="width", action="store", default=800, type="int", help="width 
of the image [format: int] [default: 800]") + parser.add_option("-Z", "--height", dest="height", action="store", default=300, type="int", help="height of the image [format: int] [default: 300]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + options = parser.parse_args()[0] + iGetReadSizes = GetReadSizes(options.verbosity) + iGetReadSizes.setNames(options.names.split(",")) + iGetReadSizes.setInputFiles(options.inputFileNames.split(","), options.format) + iGetReadSizes.setOutputFileName(options.outputFileName) + iGetReadSizes.setLabs(options.xLab, options.yLab) + iGetReadSizes.setSizes(options.minSize, options.maxSize) + iGetReadSizes.setColors(None if options.colors == None else options.colors.split(",")) + iGetReadSizes.setFactors(None if options.factors == None else map(float, options.factors.split(","))) + iGetReadSizes.setRegionsFile(options.regionsFileName) + iGetReadSizes.setImageSize(options.width, options.height) + iGetReadSizes.run() diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/GetUpDownStream.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/GetUpDownStream.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,152 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2012 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. 
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import os +from optparse import OptionParser, OptionGroup +from commons.core.parsing.ParserChooser import ParserChooser +from commons.core.writer.Gff3Writer import Gff3Writer +from SMART.Java.Python.structure.Transcript import Transcript +from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle +from SMART.Java.Python.ncList.FileSorter import FileSorter +from SMART.Java.Python.misc.Progress import Progress +from SMART.Java.Python.misc import Utils + + +class GetUpDownStream(object): + + def __init__(self, verbosity = 0): + self.verbosity = verbosity + self.inputReader = None + self.outputWriter = None + self.nbRead = 0 + self.nbWritten = 0 + self.nbMerges = 0 + self.splittedFileNames = {} + + def __del__(self): + for fileName in self.splittedFileNames.values(): + os.remove(fileName) + + def setInputFile(self, fileName, format): + parserChooser = ParserChooser(self.verbosity) + parserChooser.findFormat(format, "transcript") + self.parser = parserChooser.getParser(fileName) + self.sortedFileName = "%s_sorted.pkl" % (os.path.splitext(fileName)[0]) + + def setOutputFile(self, fileName): + self.outputWriter = 
Gff3Writer(fileName, self.verbosity) + + def setDistances(self, up, down): + self.upDistance = up + self.downDistance = down + + def _sortFile(self): + fs = FileSorter(self.parser, self.verbosity-4) + fs.perChromosome(True) + fs.setOutputFileName(self.sortedFileName) + fs.sort() + self.splittedFileNames = fs.getOutputFileNames() + self.nbElementsPerChromosome = fs.getNbElementsPerChromosome() + self.nbRead = fs.getNbElements() + + def _write(self, start, end, reference, after): + if start > end: + return + transcript = Transcript() + transcript.setChromosome(reference.getChromosome()) + transcript.setStart(start) + transcript.setEnd(end) + transcript.setDirection("+") + transcript.setName("%s_%s" % ("up" if Utils.xor(reference.getDirection() == 1, after) else "down", reference.getName())) + self.outputWriter.addTranscript(transcript) + + def _getFlanking(self, chromosome): + progress = Progress(self.nbElementsPerChromosome[chromosome], "Analyzing chromosome %s" % (chromosome), self.verbosity) + parser = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity) + previous = None + for transcript in parser.getIterator(): + progress.inc() + transcript.removeExons() + if previous == None: + distance = self.upDistance if transcript.getDirection() == 1 else self.downDistance + start = max(1, transcript.getStart() - distance) + self._write(start, transcript.getStart()-1, transcript, False) + previous = transcript + continue + if previous.include(transcript): + continue + if transcript.overlapWith(previous): + previous = transcript + continue + distancePrevious = self.downDistance if previous.getDirection() == 1 else self.upDistance + distanceCurrent = self.upDistance if transcript.getDirection() == 1 else self.downDistance + distance = transcript.getDistance(previous) + if distancePrevious + distanceCurrent == 0: + previous = transcript + continue + if distance >= distancePrevious + distanceCurrent: + endPrevious = previous.getEnd() + distancePrevious + 
startCurrent = transcript.getStart() - distanceCurrent + else: + middle = previous.getEnd() + int((distance-1) * float(distancePrevious) / (distancePrevious + distanceCurrent)) + endPrevious = middle + startCurrent = middle+1 + self._write(previous.getEnd() + 1, endPrevious, previous, True) + self._write(startCurrent, transcript.getStart() - 1, transcript, False) + previous = transcript + distance = self.downDistance if previous.getDirection() == 1 else self.upDistance + self._write(previous.getEnd() + 1, previous.getEnd() + distance, previous, True) + progress.done() + + def run(self): + self._sortFile() + for chromosome in sorted(self.nbElementsPerChromosome.keys()): + self._getFlanking(chromosome) + self.outputWriter.close() + +if __name__ == "__main__": + + # parse command line + description = "Get Up and Down Stream v1.0.0: Get the flanking regions of an annotation. [Category: Data Modification]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in mapping format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the file [compulsory] [format: mapping file format]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]") + parser.add_option("-u", "--up", dest="up", action="store", default=0, type="int", help="the upstream distance [format: int]") + parser.add_option("-d", "--down", dest="down", action="store", default=0, type="int", help="the downstream distance [format: int]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]") + (options, args) = parser.parse_args() + + guds = GetUpDownStream(options.verbosity) + guds.setInputFile(options.inputFileName, 
options.format) + guds.setOutputFile(options.outputFileName) + guds.setDistances(options.up, options.down) + guds.run() diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/RestrictFromCoverage.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/RestrictFromCoverage.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,224 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2012 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# +import os, struct, time, random +from optparse import OptionParser +from commons.core.parsing.ParserChooser import ParserChooser +from commons.core.writer.Gff3Writer import Gff3Writer +from SMART.Java.Python.structure.Transcript import Transcript +from SMART.Java.Python.structure.Interval import Interval +from SMART.Java.Python.ncList.NCList import NCList +from SMART.Java.Python.ncList.NCListCursor import NCListCursor +from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle +from SMART.Java.Python.ncList.FileSorter import FileSorter +from SMART.Java.Python.misc.Progress import Progress +from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress +from SMART.Java.Python.misc import Utils +try: + import cPickle as pickle +except: + import pickle + +REFERENCE = 0 +QUERY = 1 +TYPES = (REFERENCE, QUERY) +TYPETOSTRING = {0: "reference", 1: "query"} + +class RestrictFromCoverage(object): + + def __init__(self, verbosity = 1): + self._verbosity = verbosity + self._randomNumber = random.randint(0, 100000) + self._nbWritten = 0 + self._nbLines = dict([type, 0] for type in TYPES) + self._splittedFileNames = dict([type, {}] for type in TYPES) + self._nbElementsPerChromosome = dict([type, {}] for type in TYPES) + self._nbElements = dict([type, 0] for type in TYPES) + + def __del__(self): + pass + + def _close(self): + self._writer.close() + + def setInputFileName(self, fileName, format, type): + chooser = ParserChooser(self._verbosity) + chooser.findFormat(format) + parser = chooser.getParser(fileName) + sortedFileName = "%s_%d_%d_sorted.pkl" % (os.path.splitext(fileName)[0], self._randomNumber, type) + if self._verbosity > 2: + print "Preparing %s file..." 
% (TYPETOSTRING[type]) + startTime = time.time() + fs = FileSorter(parser, self._verbosity-1) + fs.perChromosome(True) + fs.setOutputFileName(sortedFileName) + fs.sort() + self._nbLines[type] = fs.getNbElements() + self._splittedFileNames[type] = fs.getOutputFileNames() + self._nbElementsPerChromosome[type] = fs.getNbElementsPerChromosome() + self._nbElements[type] = fs.getNbElements() + endTime = time.time() + if self._verbosity > 2: + print " ...done (%ds)" % (endTime - startTime) + + def setOutputFileName(self, outputFileName): + self._writer = Gff3Writer(outputFileName) + + def setPercent(self, minPercent, maxPercent): + self._minPercent = minPercent + self._maxPercent = maxPercent + + def setNbNucleotides(self, minNb, maxNb): + self._minNucleotides = minNb + self._maxNucleotides = maxNb + + def setOverlap(self, minOverlap, maxOverlap): + self._minOverlap = minOverlap + self._maxOverlap = maxOverlap + + def setStrands(self, boolean): + self._twoStrands = boolean + + def _compareChromosome(self, chromosome): + firstOverlap = 0 + parser1 = NCListFileUnpickle(self._splittedFileNames[QUERY][chromosome], self._verbosity) + parser2 = NCListFileUnpickle(self._splittedFileNames[REFERENCE][chromosome], self._verbosity) + progress = Progress(self._nbElementsPerChromosome[QUERY][chromosome], "Analyzing %s" % (chromosome), self._verbosity) + for transcript1 in parser1.getIterator(): + firstOverlap = self._compareList(transcript1, parser2) + parser2.setInitAddress(firstOverlap) + progress.inc() + progress.done() + + def _compareList(self, transcript1, parser2): + values = [] + for exon in transcript1.getExons(): + values.append([0.0] * exon.getSize()) + firstOverlap = None + for transcript2 in parser2.getIterator(): + address = parser2.getCurrentTranscriptAddress() + nbElements = float(transcript2.getTagValue("nbElements")) if "nbElements" in transcript2.getTagNames() else 1.0 + nbOccurrences = float(transcript2.getTagValue("nbOccurrences")) if "nbOccurrences" in 
transcript2.getTagNames() else 1.0 + nbElements /= nbOccurrences + if transcript2.getStart() > transcript1.getEnd(): + if firstOverlap == None: + firstOverlap = address + if self._checkValues(values): + self._printTranscript(transcript1) + return firstOverlap + elif transcript1.overlapWith(transcript2): + if firstOverlap == None: + firstOverlap = address + values = self._compareTranscript(transcript1, transcript2, values, nbElements) + if self._checkValues(values): + self._printTranscript(transcript1) + return firstOverlap + + def _compareTranscript(self, transcript1, transcript2, values, nbElements): + if not transcript1.overlapWith(transcript2) or ((self._twoStrands) and transcript1.getDirection() != transcript2.getDirection()): + return values + for id1, exon1 in enumerate(transcript1.getExons()): + for exon2 in transcript2.getExons(): + values[id1] = map(sum, zip(values[id1], self._compareExon(exon1, exon2, nbElements))) + return values + + def _compareExon(self, exon1, exon2, nbElements): + array = [0.0] * exon1.getSize() + if not exon1.overlapWith(exon2) or ((self._twoStrands) and exon1.getDirection() != exon2.getDirection()): + return array + for pos in range(max(exon1.getStart(), exon2.getStart()) - exon1.getStart(), min(exon1.getEnd(), exon2.getEnd()) - exon1.getStart()+1): + array[pos] += nbElements + return array + + def _filter(self, value): + if self._minOverlap and self._maxOverlap: + return self._minOverlap <= value <= self._maxOverlap + if self._minOverlap: + return self._minOverlap <= value + if self._maxOverlap: + return value <= self._maxOverlap + return True + + def _checkValues(self, values): + nbValues = sum(map(len, values)) + nbPosValues = sum(map(len, [filter(self._filter, valuePart) for valuePart in values])) + ratio = float(nbPosValues) / nbValues * 100 + if self._minNucleotides and nbPosValues < self._minNucleotides: + return False + if self._maxNucleotides and nbPosValues > self._maxNucleotides: + return False + if self._minPercent and 
ratio < self._minPercent: + return False + if self._maxPercent and ratio > self._maxPercent: + return False + return True + + def _printTranscript(self, transcript): + self._writer.addTranscript(transcript) + self._nbWritten += 1 + + def run(self): + for chromosome in sorted(self._splittedFileNames[QUERY].keys()): + self._compareChromosome(chromosome) + self._close() + if self._verbosity > 0: + print "# queries: %d" % (self._nbElements[QUERY]) + print "# refs: %d" % (self._nbElements[REFERENCE]) + print "# written: %d (%d%%)" % (self._nbWritten, 0 if self._nbElements[QUERY] == 0 else round(float(self._nbWritten) / self._nbElements[QUERY] * 100)) + + +if __name__ == "__main__": + description = "Restrict From Coverage v1.0.0: Select the elements from the first set which have a given coverage. [Category: Data Comparison]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]") + parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]") + parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of file 2 [compulsory] [format: transcript file format]") + parser.add_option("-o", "--output", dest="output", action="store", default=None, type="string", help="output file [compulsory] [format: output file in GFF3 format]") + parser.add_option("-n", "--minNucleotides", dest="minNucleotides", action="store", default=None, type="int", help="minimum number of nucleotides overlapping to declare an overlap [format: int]") + parser.add_option("-N", "--maxNucleotides", dest="maxNucleotides", action="store", 
default=None, type="int", help="maximum number of nucleotides overlapping to declare an overlap [format: int]") + parser.add_option("-p", "--minPercent", dest="minPercent", action="store", default=None, type="int", help="minimum percentage of nucleotides overlapping to declare an overlap [format: int]") + parser.add_option("-P", "--maxPercent", dest="maxPercent", action="store", default=None, type="int", help="maximum percentage of nucleotides overlapping to declare an overlap [format: int]") + parser.add_option("-e", "--minOverlap", dest="minOverlap", action="store", default=None, type="int", help="minimum number of elements from 2nd file to declare an overlap [format: int]") + parser.add_option("-E", "--maxOverlap", dest="maxOverlap", action="store", default=None, type="int", help="maximum number of elements from 2nd file to declare an overlap [format: int]") + parser.add_option("-s", "--strands", dest="strands", action="store_true", default=False, help="consider the two strands separately [format: bool] [default: false]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + rfc = RestrictFromCoverage(options.verbosity) + rfc.setInputFileName(options.inputFileName1, options.format1, QUERY) + rfc.setInputFileName(options.inputFileName2, options.format2, REFERENCE) + rfc.setOutputFileName(options.output) + rfc.setNbNucleotides(options.minNucleotides, options.maxNucleotides) + rfc.setPercent(options.minPercent, options.maxPercent) + rfc.setOverlap(options.minOverlap, options.maxOverlap) + rfc.setStrands(options.strands) + rfc.run() diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/SelectByTag.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/SelectByTag.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,148 @@ +#! 
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class SelectByTag(object):
    """Copy to the output the transcripts whose tag value matches a criterion.

    The caller sets the public attributes below, then calls run().
    A transcript is kept when its tag equals `value` (if `value` is set),
    otherwise when the tag, read as a float, lies within [min, max].
    """

    def __init__(self, verbosity = 1):
        # settings, filled in by the caller
        self.input     = None   # input file name
        self.format    = None   # format of the input file
        self.tag       = None   # name of the tag to inspect
        self.value     = None   # exact value to look for (string), or None
        self.min       = None   # lower bound on the tag value, or None
        self.max       = None   # upper bound on the tag value, or None
        self.default   = None   # value used when a transcript lacks the tag
        self.output    = None   # output file name
        self.mysql     = None   # when true, also write with the MySQL writer
        self.verbosity = verbosity

        # working state
        self.parser      = None
        self.writer      = None
        self.mysqlWriter = None
        self.nbElements  = None
        self.nbWritten   = 0


    def setParser(self):
        """Open the input and record how many transcripts it holds."""
        self.parser     = TranscriptContainer(self.input, self.format, self.verbosity)
        self.nbElements = self.parser.getNbTranscripts()


    def setWriter(self):
        """Create the GFF3 writer and, when requested, the MySQL writer."""
        self.writer = Gff3Writer(self.output, self.verbosity)
        if self.mysql:
            # The file-level "from commons.core.writer import MySqlTranscriptWriter"
            # binds the module, which cannot be called; fetch the class of
            # the same name from inside that module.
            from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter as MySqlWriterClass
            self.mysqlWriter = MySqlWriterClass(self.output, self.verbosity)


    def isAccepted(self, transcript):
        """Return a true value when the transcript passes the selection.

        Raises Exception when the transcript has no such tag and no
        default value was given; float() may raise ValueError when a
        range is requested on a non-numeric tag value.
        """
        value = transcript.getTagValue(self.tag)
        if value is None:
            if self.default is None:
                raise Exception("Error! Transcript %s no tag called '%s'" % (transcript, self.tag))
            value = self.default
        if self.value is not None:
            if self.value == str(value):
                return True
            if not self.value.isdigit():
                return False
            # Numeric comparison, so that "5" also matches 5.0 or "5.0";
            # a tag value that is not a number simply does not match.
            try:
                return float(value) == float(self.value)
            except (TypeError, ValueError):
                return False
        value = float(value)
        return (self.min is None or self.min <= value) and (self.max is None or self.max >= value)


    def readInputFile(self):
        """Send every accepted transcript to the writer(s) and count them."""
        progress = Progress(self.parser.getNbTranscripts(), "Writing transcripts", self.verbosity)
        for transcript in self.parser.getIterator():
            if self.isAccepted(transcript):
                self.writer.addTranscript(transcript)
                if self.mysql:
                    self.mysqlWriter.addTranscript(transcript)
                self.nbWritten += 1
            progress.inc()
        progress.done()


    def writeFile(self):
        """Ask the writer(s) to write their content."""
        self.writer.write()
        if self.mysql:
            self.mysqlWriter.write()


    def run(self):
        """Run the whole selection and print a summary when verbosity > 0."""
        self.setParser()
        self.setWriter()
        self.readInputFile()
        self.writeFile()
        if self.verbosity > 0:
            # a single parenthesised argument prints identically with the
            # print statement and the print function
            print("%d input" % (self.nbElements))
            if self.nbElements != 0:
                print("%d output (%.2f%%)" % (self.nbWritten, float(self.nbWritten) / self.nbElements * 100))
[Category: Data Selection]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input [compulsory] [format: transcript file format]") + parser.add_option("-g", "--tag", dest="tag", action="store", default=None, type="string", help="the tag [compulsory] [format: string]") + parser.add_option("-a", "--value", dest="value", action="store", default=None, type="string", help="the value to be found [format: string]") + parser.add_option("-m", "--min", dest="min", action="store", default=None, type="float", help="the minimum threshold [format: float]") + parser.add_option("-M", "--max", dest="max", action="store", default=None, type="float", help="the maximum threshold [format: float]") + parser.add_option("-d", "--default", dest="default", action="store", default=None, type="float", help="value if tag is not present [format: float]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]") + parser.add_option("-y", "--mysql", dest="mysql", action="store_true", default=False, help="write output into MySQL tables [format: boolean] [default: False]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + selectByTag = SelectByTag(options.verbosity) + selectByTag.input = options.inputFileName + selectByTag.format = options.format + selectByTag.tag = options.tag + selectByTag.value = options.value + selectByTag.min = options.min + selectByTag.max = options.max + selectByTag.default = options.default + selectByTag.output = options.outputFileName + selectByTag.mysql = options.mysql + 
def toTar(tarFileName, directory):
    """Archive the files of `directory` whose name contains the output base name.

    tarFileName -- path of the archive to produce; its base name without
                   extension selects the files to pack
    directory   -- directory holding the files to pack

    The archive is first built as "<tarFileName without extension>.tmp.tar"
    and renamed to `tarFileName` once complete.  Members are stored under
    their bare file name.
    """
    fileName = os.path.splitext(tarFileName)[0]
    fileNameBaseName = os.path.basename(fileName)
    tmpTarFileName = fileName + ".tmp.tar"
    tfile = tarfile.open(tmpTarFileName, "w")
    try:
        for entry in sorted(os.listdir(directory)):
            path = os.path.join(directory, entry)
            # never pack the archive being written into itself
            if os.path.abspath(path) == os.path.abspath(tmpTarFileName):
                continue
            # plain substring test: the base name is not a regular expression
            if fileNameBaseName in entry:
                # explicit path + arcname: independent of the current directory
                tfile.add(path, arcname=entry)
    finally:
        # close before moving so the end-of-archive blocks are on disk
        tfile.close()
    # use the parameter rather than the global `options`, and no shell
    shutil.move(tmpTarFileName, tarFileName)
[Category: Visualization]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]") + parser.add_option("-o", "--output", dest="outTarFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]") + parser.add_option("-r", "--reference", dest="referenceFileName", action="store", default=None, type="string", help="file containing the genome [compulsory] [format: file in FASTA format]") + parser.add_option("-n", "--nbBins", dest="nbBins", action="store", default=1000, type="int", help="number of bins [default: 1000] [format: int]") + parser.add_option("-2", "--bothStrands", dest="bothStrands", action="store_true", default=False, help="plot one curve per strand [format: bool] [default: false]") + parser.add_option("-w", "--raw", dest="raw", action="store_true", default=False, help="plot raw number of occurrences instead of density [format: bool] [default: false]") + parser.add_option("-x", "--csv", dest="csv", action="store_true", default=False, help="write a .csv file [format: bool]") + parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="plot only a chromosome [format: string]") + parser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="start from a given region [format: int]") + parser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="end from a given region [format: int]") + parser.add_option("-y", "--yMin", dest="yMin", action="store", default=None, type="int", help="minimum value on the y-axis to plot [format: int]") + parser.add_option("-Y", "--yMax", 
dest="yMax", action="store", default=None, type="int", help="maximum value on the y-axis to plot [format: int]") + parser.add_option("-g", "--gff", dest="gff", action="store_true", default=False, help="also write GFF3 file [format: bool] [default: false]") + parser.add_option("-H", "--height", dest="height", action="store", default=None, type="int", help="height of the graphics [format: int] [default: 300]") + parser.add_option("-W", "--width", dest="width", action="store", default=None, type="int", help="width of the graphics [format: int] [default: 1000]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]") + parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool]") + (options, args) = parser.parse_args() + + + absPath = os.getcwd() + print "the current path is :", absPath + directory = "/tmp/wrappGetDistribution" + print "the dir path is :", directory + if not os.path.exists(directory): + os.makedirs(directory) + os.chdir(directory) + if options.inputFileName != None and options.format != None and options.outTarFileName != None: + outputFileName = os.path.splitext(os.path.basename(options.outTarFileName))[0] + cmd = "python %s/Java/Python/getDistribution.py -i %s -f %s -o %s -D %s" % (SMART_PATH, options.inputFileName, options.format, outputFileName, directory) + if options.referenceFileName != None : + cmd += " -r %s" % options.referenceFileName + if options.nbBins != None : + cmd += " -n %s" % options.nbBins + if options.chromosome : + cmd += " -c %s" % options.chromosome + if options.start != None : + cmd += " -s %s" % options.start + if options.end != None : + cmd += " -e %s" % options.end + if options.yMin != None : + cmd += " -y %s" % options.yMin + if options.yMax != None : + cmd += " -Y %s" % options.yMax + if options.height != None : + cmd += " -H %s" % options.height + if options.width != None : + 
def toTar(tarFileName, directory):
    """Archive the files of `directory` whose name contains the output base name.

    tarFileName -- path of the archive to produce; its base name without
                   extension selects the files to pack
    directory   -- directory holding the files to pack

    The archive is first built as "<tarFileName without extension>.tmp.tar"
    and renamed to `tarFileName` once complete.  Members are stored under
    their bare file name.
    """
    fileName = os.path.splitext(tarFileName)[0]
    fileNameBaseName = os.path.basename(fileName)
    tmpTarFileName = fileName + ".tmp.tar"
    tfile = tarfile.open(tmpTarFileName, "w")
    try:
        for entry in sorted(os.listdir(directory)):
            path = os.path.join(directory, entry)
            # never pack the archive being written into itself
            if os.path.abspath(path) == os.path.abspath(tmpTarFileName):
                continue
            # plain substring test: the base name is not a regular expression
            if fileNameBaseName in entry:
                # explicit path + arcname: independent of the current directory
                tfile.add(path, arcname=entry)
    finally:
        # close before moving so the end-of-archive blocks are on disk
        tfile.close()
    # use the parameter rather than the global `options`, and no shell
    shutil.move(tmpTarFileName, tarFileName)
[Category: Visualization]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file sequence [compulsory] [format: file in sequence format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the file [compulsory] [format: sequence file format]") + parser.add_option("-n", "--number", dest="number", action="store", default=None, type="int", help="keep the best n [format: int]") + parser.add_option("-p", "--percent", dest="percent", action="store", default=None, type="float", help="keep the best n\% [format: float]") + parser.add_option("-o", "--output", dest="outTarFileName", action="store", type="string", help="output file [compulsory] [format: zip]") + + (options, args) = parser.parse_args() + + + absPath = os.getcwd() + print "the current path is :", absPath + directory = "/tmp/wrappGetReadDistribution" + print "the dir path is :", directory + if not os.path.exists(directory): + os.makedirs(directory) + os.chdir(directory) + if options.inputFileName != None and options.format != None and options.outTarFileName != None: + outputFileName = os.path.splitext(os.path.basename(options.outTarFileName))[0] + cmd = "python %s/Java/Python/getReadDistribution.py -i %s -f %s -o %s -D %s" % (SMART_PATH, options.inputFileName, options.format, outputFileName, directory) + if options.number != None : + cmd += " -n %s" % options.number + if options.percent != None : + cmd += " -p %s" % options.percent + print "cmd is: ", cmd + status = subprocess.call(cmd, shell=True) + if status != 0: + raise Exception("Problem with the execution of command!") + toTar(options.outTarFileName, directory) + shutil.rmtree(directory) + diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/WrappPlotCoverage.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/WrappPlotCoverage.py Thu Jan 17 
def toTar(tarFileName, directory):
    """Archive the files of `directory` whose name contains the output base name.

    tarFileName -- path of the archive to produce; its base name without
                   extension selects the files to pack
    directory   -- directory holding the files to pack

    The archive is first built as "<tarFileName without extension>.tmp.tar"
    and renamed to `tarFileName` once complete.  Members are stored under
    their bare file name.
    """
    fileName = os.path.splitext(tarFileName)[0]
    fileNameBaseName = os.path.basename(fileName)
    tmpTarFileName = fileName + ".tmp.tar"
    tfile = tarfile.open(tmpTarFileName, "w")
    try:
        for entry in sorted(os.listdir(directory)):
            path = os.path.join(directory, entry)
            # never pack the archive being written into itself
            if os.path.abspath(path) == os.path.abspath(tmpTarFileName):
                continue
            # plain substring test: the base name is not a regular expression
            if fileNameBaseName in entry:
                # explicit path + arcname: independent of the current directory
                tfile.add(path, arcname=entry)
    finally:
        # close before moving so the end-of-archive blocks are on disk
        tfile.close()
    # use the parameter rather than the global `options`, and no shell
    shutil.move(tmpTarFileName, tarFileName)
the plots (in px) [format: int] [default: 1500]") + parser.add_option("-e", "--height", dest="height", action="store", default=1000, type="int", help="height of the plots (in px) [format: int] [default: 1000]") + parser.add_option("-t", "--title", dest="title", action="store", default="", type="string", help="title of the plots [format: string]") + parser.add_option("-x", "--xlab", dest="xLabel", action="store", default="", type="string", help="label on the x-axis [format: string]") + parser.add_option("-y", "--ylab", dest="yLabel", action="store", default="", type="string", help="label on the y-axis [format: string]") + parser.add_option("-p", "--plusColor", dest="plusColor", action="store", default="red", type="string", help="color for the elements on the plus strand [format: string] [default: red]") + parser.add_option("-m", "--minusColor", dest="minusColor", action="store", default="blue", type="string", help="color for the elements on the minus strand [format: string] [default: blue]") + parser.add_option("-s", "--sumColor", dest="sumColor", action="store", default="black", type="string", help="color for 2 strands coverage line [format: string] [default: black]") + parser.add_option("-l", "--lineColor", dest="lineColor", action="store", default="black", type="string", help="color for the lines [format: string] [default: black]") + parser.add_option("-1", "--merge", dest="merge", action="store_true", default=False, help="merge the 2 plots in 1 [format: boolean] [default: false]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + absPath = os.getcwd() + directory = "/tmp/wrappPlotCov" + if not os.path.exists(directory): + os.makedirs(directory) + os.chdir(directory) + if options.inputFileName1 != None and options.inputFormat1 != None and options.inputFileName2 != None and options.inputFormat2 != None and options.outTarFileName != None: + 
outputFileName = os.path.splitext(os.path.basename(options.outTarFileName))[0] + print 'outputfile is :', outputFileName + cmd = "python %s/Java/Python/plotCoverage.py -i %s -f %s -j %s -g %s -o %s -D %s" % (SMART_PATH, options.inputFileName1, options.inputFormat1, options.inputFileName2, options.inputFormat2, outputFileName, directory) + if options.inputSequence!= None: + cmd += " -q %s" % options.inputSequence + if options.width != None: + cmd += " -w %s" % options.width + if options.height != None: + cmd += " -e %s" % options.height + if options.title != None: + cmd += " -t %s" % options.title + if options.xLabel != None: + cmd += " -x %s" % options.xLabel + if options.yLabel != None: + cmd += " -y %s" % options.yLabel + if options.plusColor != None: + cmd += " -p %s" % options.plusColor + if options.minusColor != None: + cmd += " -m %s" % options.minusColor + if options.sumColor != None: + cmd += " -s %s" % options.sumColor + if options.lineColor != None: + cmd += " -l %s" % options.lineColor + if options.merge: + cmd += " -1" + status = subprocess.call(cmd, shell=True) + if status != 0: + raise Exception("Problem with the execution of command!") + toTar(options.outTarFileName, directory) + shutil.rmtree(directory) + + + + + diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/WrappPlotRepartition.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/WrappPlotRepartition.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,71 @@ +#! 
def toTar(tarFileName, directory):
    """Archive the files of `directory` whose name contains the output base name.

    tarFileName -- path of the archive to produce; its base name without
                   extension selects the files to pack
    directory   -- directory holding the files to pack

    The archive is first built as "<tarFileName without extension>.tmp.tar"
    and renamed to `tarFileName` once complete.  Members are stored under
    their bare file name.
    """
    fileName = os.path.splitext(tarFileName)[0]
    fileNameBaseName = os.path.basename(fileName)
    tmpTarFileName = fileName + ".tmp.tar"
    tfile = tarfile.open(tmpTarFileName, "w")
    try:
        for entry in sorted(os.listdir(directory)):
            path = os.path.join(directory, entry)
            # never pack the archive being written into itself
            if os.path.abspath(path) == os.path.abspath(tmpTarFileName):
                continue
            # plain substring test: the base name is not a regular expression
            if fileNameBaseName in entry:
                # explicit path + arcname: independent of the current directory
                tfile.add(path, arcname=entry)
    finally:
        # close before moving so the end-of-archive blocks are on disk
        tfile.close()
    # use the parameter rather than the global `options`, and no shell
    shutil.move(tmpTarFileName, tarFileName)
"--log",dest="log",action="store",default="",type="string", help="use log on x- or y-axis (write 'x', 'y' or 'xy') [format: string]") + parser.add_option("-v", "--verbosity",dest="verbosity",action="store",default=1,type="int",help="trace level [format: int]") + (options, args) = parser.parse_args() + + + absPath = os.getcwd() + print "the current path is :", absPath + directory = "/tmp/wrappPlotRepartition" + print "the dir path is :", directory + if not os.path.exists(directory): + os.makedirs(directory) + os.chdir(directory) + if options.inputFileName != None and options.format != None and options.outTarFileName != None: + outputFileName = os.path.splitext(os.path.basename(options.outTarFileName))[0] + cmd = "python %s/Java/Python/plotRepartition.py -i %s -o %s -D %s" % (SMART_PATH, options.inputFileName, outputFileName, directory) + if options.names != None : + cmd += " -n %s" % options.names + else: print "You must choose tag names !" + if options.colors != None : + cmd += " -c %s" % options.colors + if options.format != None: + cmd += " -f %s" % options.format + if options.normalize : + cmd += " -r " + if options.log != "" : + cmd += " -l %s" % options.log + + print "cmd is: ", cmd + status = subprocess.call(cmd, shell=True) + if status != 0: + raise Exception("Problem with the execution of command!") + toTar(options.outTarFileName, directory) + shutil.rmtree(directory) + diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/__init__.py diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/__init__.pyc Binary file smart_toolShed/SMART/Java/Python/__init__.pyc has changed diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/adaptorStripper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/adaptorStripper.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,115 @@ +#! 
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
def distance(string1, string2):
    """Count the positions at which two strings of equal length differ.

    Returns the number of mismatching positions (the Hamming distance),
    or None when the two strings do not have the same length.
    """
    if len(string1) != len(string2):
        return None
    # pairwise comparison; no local variable hides the function's own name
    return sum(1 for char1, char2 in zip(string1, string2) if char1 != char2)
help="trace level [format: int] [default: 1]") + parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]") + (options, args) = parser.parse_args() + + if options.log: + logHandle = open(options.outputFileName + ".log", "w") + + + writer = FastaWriter(options.outputFileName + ".fas", options.verbosity) + sequenceParser = FastaParser(options.inputFileName, options.verbosity) + nbSequences = sequenceParser.getNbSequences() + + # treat sequences + progress = Progress(sequenceParser.getNbSequences(), "Analyzing " + options.inputFileName, options.verbosity) + for sequence in sequenceParser.getIterator(): + fivePrimeAdaptor = sequence.getSequence()[0:len(options.fivePrimeAdaptor)] + threePrimeAdaptor = sequence.getSequence()[len(sequence.sequence)-len(options.threePrimeAdaptor):] + + # check 5' adaptor + fivePrimeDistance = distance(fivePrimeAdaptor, options.fivePrimeAdaptor) + # check 3' adaptor + threePrimeDistance = len(threePrimeAdaptor) + for i in range(options.threePrimeSize, len(threePrimeAdaptor)+1): + threePrimeDistance = min(threePrimeDistance, distance(threePrimeAdaptor[-i:], options.threePrimeAdaptor[:i])) + + # sort candidates + if fivePrimeDistance > options.fivePrimeDistance: + if options.log: + logHandle.write("Sequence %s does not start with the right adaptor (%s != %s)\n" % (sequence.getSequence(), fivePrimeAdaptor, options.fivePrimeAdaptor)) + elif threePrimeDistance > options.threePrimeDistance: + if options.log: + logHandle.write("Sequence %s does not end with the right adaptor (%s != %s)\n" % (sequence.getSequence(), threePrimeAdaptor, options.threePrimeAdaptor)) + else: + nbRemaining += 1 + sequence.setSequence(sequence.getSequence()[len(options.fivePrimeAdaptor):len(sequence.getSequence())-len(options.threePrimeAdaptor)]) + writer.addSequence(sequence) + + progress.inc() + + progress.done() + + if options.log: + logHandle.close() + + writer.write() + + print "kept %i 
#!/bin/bash
# Replace the tab-delimited field $2 by $3 on every line of file $1 and
# print the result on standard output.
#
# Usage: changeGffFeatures.sh <file> <oldFeature> <newFeature>
#
# NOTE(review): $2 and $3 are pasted verbatim into the sed expression, so
# a "/" or a regular-expression metacharacter in either one changes the
# meaning of the command; "\t" relies on sed translating it to a tab
# (GNU sed does) -- confirm on other platforms.
#
# The file name is quoted so that names containing spaces or glob
# characters reach sed as one argument; when no file is given nothing is
# passed and sed reads standard input, as before.
sed "s/\t$2\t/\t$3\t/g" ${1:+"$1"}
"""
Change the name of a tag
"""

import os
import random
from optparse import OptionParser
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from SMART.Java.Python.misc.Progress import Progress
from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter
from commons.core.writer.Gff3Writer import Gff3Writer


if __name__ == "__main__":

    # parse command line
    description = "Change Tag Name v1.0.1: Change the name of tag of a list of transcripts. [Category: Data Modification]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--inputFormat", dest="inputFormat", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-t", "--tag", dest="tag", action="store", type="string", help="name of the tag to change [compulsory] [format: string]")
    parser.add_option("-n", "--name", dest="name", action="store", type="string", help="new name for the tag [compulsory] [format: string]")
    parser.add_option("-y", "--mysql", dest="mysql", action="store_true", default=False, help="mySQL output [format: bool] [default: false]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")
    parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")
    (options, args) = parser.parse_args()

    # With -l the log file is still created (as before). Nothing in this script
    # writes to it, so the handle is only kept in order to be closed below.
    logHandle = None
    if options.log:
        logHandle = open("%s.log" % options.outputFileName, "w")

    try:
        # create parser and writer(s); the GFF3 output goes to a temporary file
        # that is renamed to the requested output name once everything is written
        transcriptParser = TranscriptContainer(options.inputFileName, options.inputFormat, options.verbosity)
        tmpFileName = "tmpTranscriptFile%d.gff3" % (random.randint(0, 100000))
        writer = Gff3Writer(tmpFileName, options.verbosity)
        mysqlWriter = None
        if options.mysql:
            mysqlWriter = MySqlTranscriptWriter(options.outputFileName, options.verbosity)

        # process transcripts: move the value stored under the old tag name to the new one
        progress = Progress(transcriptParser.getNbTranscripts(), "Printing transcripts %s" % (options.inputFileName), options.verbosity)
        for transcript in transcriptParser.getIterator():
            if options.tag in transcript.tags:
                value = transcript.tags[options.tag]
                del transcript.tags[options.tag]
                transcript.tags[options.name] = value
            # every transcript is written, whether it carried the tag or not
            writer.addTranscript(transcript)
            if mysqlWriter is not None:
                mysqlWriter.addTranscript(transcript)
            progress.inc()
        progress.done()
        transcriptParser.transcriptListParser.close()

        writer.write()

        if mysqlWriter is not None:
            mysqlWriter.write()

        os.rename(tmpFileName, options.outputFileName)
    finally:
        # previously the log handle was leaked
        if logHandle is not None:
            logHandle.close()
"""
Clean a GFF file (as given by NCBI or TAIR) and outputs a GFF3 file.
"""

import os
import re
from optparse import OptionParser
from commons.core.parsing.GffParser import *
from SMART.Java.Python.misc.RPlotter import *
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress

# Module-level counters used to build IDs for lines that have a parent but no ID:
# count[parent][type] is the number of such lines seen so far.
count = {}

class ParsedLine(object):
    """
    One line of a GFF file, split into its 9 fields, with its attribute column parsed.
    @ivar line:          the stripped input line
    @ivar cpt:           the index of the line in the input (used to build a default ID)
    @ivar splittedLine:  the 9 whitespace-separated fields
    @ivar type:          the third field (feature type)
    @ivar parsedOptions: the attributes of the 9th field, as a dict
    @ivar id:            the ID of the line (read from the attributes, or generated)
    @ivar parents:       the list of parent IDs, or None
    """

    def __init__(self, line, cpt):
        self.line = line
        self.cpt = cpt
        self.parse()

    def parse(self):
        """
        Split the line and fill in type, attributes, ID and parents
        @raise Exception: if the line has fewer than 9 fields
        """
        self.line = self.line.strip()
        self.splittedLine = self.line.split(None, 8)
        if len(self.splittedLine) < 9:
            raise Exception("Line '%s' has less than 9 fields. Exiting..." % (self.line))
        self.type = self.splittedLine[2]
        self.parseOptions()
        self.getId()
        self.getParents()

    def parseOptions(self):
        """
        Parse the ';'-separated attributes of the 9th field.
        Accepts 'key=value' and 'key value'; a bare token is stored under the key 'ID'.
        """
        self.parsedOptions = {}
        for option in self.splittedLine[8].split(";"):
            option = option.strip()
            if option == "": continue
            posSpace = option.find(" ")
            posEqual = option.find("=")
            # '=' is the separator only if it comes before the first space (or there is no space)
            if posEqual != -1 and (posEqual < posSpace or posSpace == -1):
                key, value = option.split("=", 1)
            elif posSpace != -1:
                key, value = option.split(None, 1)
            else:
                key = "ID"
                value = option
            self.parsedOptions[key.strip()] = value.strip(" \"")

    def getId(self):
        """
        Set self.id: the value of an existing 'id' attribute (any case), else
        '<parent>-<type>-<n>' when a 'Parent' attribute exists, else 'smart<cpt>'.
        A generated ID is also stored in the attributes under 'ID'.
        """
        for key in self.parsedOptions:
            if key.lower() == "id":
                self.id = self.parsedOptions[key]
                return
        if "Parent" in self.parsedOptions:
            parent = self.parsedOptions["Parent"].split(",")[0]
            perType = count.setdefault(parent, {})
            perType[self.type] = perType.get(self.type, 0) + 1
            self.id = "%s-%s-%d" % (parent, self.type, perType[self.type])
        else:
            self.id = "smart%d" % (self.cpt)
        self.parsedOptions["ID"] = self.id

    def getParents(self):
        """
        Set self.parents to the comma-separated values of the first 'parent' or
        'derives_from' attribute found (any case), or to None
        """
        for key in self.parsedOptions:
            if key.lower() in ("parent", "derives_from"):
                self.parents = self.parsedOptions[key].split(",")
                return
        self.parents = None

    def removeParent(self):
        """
        Remove every 'parent' / 'derives_from' attribute (any case)
        """
        # iterate over a copy of the keys: the dict is modified inside the loop
        for key in list(self.parsedOptions.keys()):
            if key.lower() in ("parent", "derives_from"):
                del self.parsedOptions[key]

    def export(self):
        """
        @return: the line, tab-separated, with the attributes rewritten as 'key=value;...' and a trailing newline
        """
        self.splittedLine[8] = ";".join(["%s=%s" % (key, value) for key, value in self.parsedOptions.items()])
        return "%s\n" % ("\t".join(self.splittedLine))


class CleanGff(object):
    """
    Read a GFF file, keep the lines of the accepted types, and write them so that
    each top-level line is followed by its children.
    @ivar lines:         the kept lines, indexed by ID
    @ivar acceptedTypes: the feature types to keep
    @ivar parents:       the lines with no known parent
    @ivar children:      for each parent ID, the list of its children lines
    """

    def __init__(self, verbosity = 1):
        self.verbosity = verbosity
        self.lines = {}
        self.acceptedTypes = []
        self.parents = []
        self.children = {}

    def setInputFileName(self, name):
        self.inputFile = open(name)

    def setOutputFileName(self, name):
        self.outputFile = open(name, "w")

    def setAcceptedTypes(self, types):
        self.acceptedTypes = types

    def parse(self):
        """
        Read the input file and store the lines of accepted types.
        Blank lines and lines starting with '#' are skipped; reading stops at the
        first line starting with '>'. The input file is closed afterwards.
        """
        progress = UnlimitedProgress(100000, "Reading input file", self.verbosity)
        try:
            for cpt, line in enumerate(self.inputFile):
                # blank lines used to reach ParsedLine and raise "less than 9 fields"
                if not line.strip() or line[0] == "#": continue
                if line[0] == ">": break
                parsedLine = ParsedLine(line, cpt)
                if parsedLine.type in self.acceptedTypes:
                    self.lines[parsedLine.id] = parsedLine
                progress.inc()
        finally:
            # the input handle used to be left open
            self.inputFile.close()
        progress.done()

    def sort(self):
        """
        Attach each line to those of its parents that were kept; a line with no
        kept parent loses its parent attributes and becomes a top-level line.
        """
        progress = Progress(len(self.lines.keys()), "Sorting file", self.verbosity)
        for line in self.lines.values():
            parentFound = False
            if line.parents:
                for parent in line.parents:
                    if parent in self.lines:
                        parentFound = True
                        self.children.setdefault(parent, []).append(line)
            if not parentFound:
                line.removeParent()
                self.parents.append(line)
            progress.inc()
        progress.done()

    def write(self):
        """
        Write every top-level line followed by its descendants, then close the output file
        """
        progress = Progress(len(self.parents), "Writing output file", self.verbosity)
        for line in self.parents:
            self.writeLine(line)
            progress.inc()
        self.outputFile.close()
        progress.done()

    def writeLine(self, line):
        """
        Write a line, then (recursively) its children
        """
        self.outputFile.write(line.export())
        if line.id in self.children:
            for child in self.children[line.id]:
                self.writeLine(child)

    def run(self):
        self.parse()
        self.sort()
        self.write()


if __name__ == "__main__":

    # parse command line
    description = "Clean GFF v1.0.3: Clean a GFF file (as given by NCBI) and outputs a GFF3 file. [Category: Other]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file name [compulsory] [format: file in GFF format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-t", "--types", dest="types", action="store", default="mRNA,exon", type="string", help="list of comma-separated types that you want to keep [format: string] [default: mRNA,exon]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    cleanGff = CleanGff(options.verbosity)
    cleanGff.setInputFileName(options.inputFileName)
    cleanGff.setOutputFileName(options.outputFileName)
    cleanGff.setAcceptedTypes(options.types.split(","))
    cleanGff.run()
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import sys +from SMART.Java.Python.cleaning.TranscriptListCleaner import TranscriptListCleaner +from SMART.Java.Python.cleaning.GffCleaner import GffCleaner +from SMART.Java.Python.cleaning.GtfCleaner import GtfCleaner +from SMART.Java.Python.cleaning.DefaultCleaner import DefaultCleaner + +#Attention!! Do not delete the imports!! They are used to know the type of file format!!! 
class CleanerChooser(object):
    """
    Picks the cleaner class that handles a given file format
    @ivar cleanerClass: the cleaner class selected by findFormat
    @type cleanerClass: class
    @ivar verbosity:    verbosity, handed to the cleaner when it is built
    @type verbosity:    int
    """

    def __init__(self, verbosity = 0):
        """
        Constructor
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.verbosity = verbosity


    def findFormat(self, format):
        """
        Select the first direct subclass of TranscriptListCleaner that declares
        the given format; fall back on DefaultCleaner when none does.
        The selection is stored in self.cleanerClass.
        @param format: the format
        @type  format: string
        """
        for candidate in TranscriptListCleaner.__subclasses__():
            if candidate == None:
                continue
            formats = candidate.getFileFormats()
            if formats != None and format in formats:
                self.cleanerClass = candidate
                break
        else:
            # no subclass claimed this format
            self.cleanerClass = DefaultCleaner


    def getCleaner(self):
        """
        Build the cleaner selected by findFormat
        @return: an instance of the selected class, built with the stored verbosity
        """
        chosenClass = self.cleanerClass
        return chosenClass(self.verbosity)
"""
Default cleaner. Does nothing but copying.
"""
from SMART.Java.Python.cleaning.TranscriptListCleaner import TranscriptListCleaner
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress


class DefaultCleaner(TranscriptListCleaner):
    """Cleaner that copies the input handle to the output handle unchanged"""

    def __init__(self, verbosity = 1):
        """
        Constructor
        @param verbosity: verbosity, passed to the base class
        @type  verbosity: int
        """
        TranscriptListCleaner.__init__(self, verbosity)

    def _clean(self):
        """
        Read the whole input in one go and write it to the output as is
        """
        content = self.inputHandle.read()
        self.outputHandle.write(content)
"""
Clean a GFF file (as given by NCBI or TAIR) and outputs a GFF3 file.
"""

from SMART.Java.Python.cleaning.TranscriptListCleaner import TranscriptListCleaner
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress

# Module-level counters used to build IDs for lines that have a parent but no ID:
# count[parent][type] is the number of such lines seen so far.
count = {}

class ParsedLine(object):
    """
    One line of a GFF file, split into its 9 fields, with its attribute column parsed.
    @ivar line:          the stripped input line
    @ivar cpt:           the index of the line in the input (used to build a default ID)
    @ivar splittedLine:  the 9 whitespace-separated fields
    @ivar type:          the third field (feature type)
    @ivar parsedOptions: the attributes of the 9th field, as a dict
    @ivar id:            the ID of the line (read from the attributes, or generated)
    @ivar parents:       the list of parent IDs, or None
    """

    def __init__(self, line, cpt):
        self.line = line
        self.cpt = cpt
        self.parse()

    def parse(self):
        """
        Split the line and fill in type, attributes, ID and parents
        @raise Exception: if the line has fewer than 9 fields
        """
        self.line = self.line.strip()
        self.splittedLine = self.line.split(None, 8)
        if len(self.splittedLine) < 9:
            raise Exception("Line '%s' has less than 9 fields. Exiting..." % (self.line))
        self.type = self.splittedLine[2]
        self.parseOptions()
        self.getId()
        self.getParents()

    def parseOptions(self):
        """
        Parse the ';'-separated attributes of the 9th field.
        Accepts 'key=value' and 'key value'; a bare token is stored under the key 'ID'.
        """
        self.parsedOptions = {}
        for option in self.splittedLine[8].split(";"):
            option = option.strip()
            if option == "": continue
            posSpace = option.find(" ")
            posEqual = option.find("=")
            # '=' is the separator only if it comes before the first space (or there is no space)
            if posEqual != -1 and (posEqual < posSpace or posSpace == -1):
                key, value = option.split("=", 1)
            elif posSpace != -1:
                key, value = option.split(None, 1)
            else:
                key = "ID"
                value = option
            self.parsedOptions[key.strip()] = value.strip(" \"")

    def getId(self):
        """
        Set self.id: the value of an existing 'id' attribute (any case), else
        '<parent>-<type>-<n>' when a 'Parent' attribute exists, else 'smart<cpt>'.
        A generated ID is also stored in the attributes under 'ID'.
        """
        for key in self.parsedOptions:
            if key.lower() == "id":
                self.id = self.parsedOptions[key]
                return
        if "Parent" in self.parsedOptions:
            parent = self.parsedOptions["Parent"].split(",")[0]
            perType = count.setdefault(parent, {})
            perType[self.type] = perType.get(self.type, 0) + 1
            self.id = "%s-%s-%d" % (parent, self.type, perType[self.type])
        else:
            self.id = "smart%d" % (self.cpt)
        self.parsedOptions["ID"] = self.id

    def getParents(self):
        """
        Set self.parents to the comma-separated values of the first 'parent' or
        'derives_from' attribute found (any case), or to None
        """
        for key in self.parsedOptions:
            if key.lower() in ("parent", "derives_from"):
                self.parents = self.parsedOptions[key].split(",")
                return
        self.parents = None

    def removeParent(self):
        """
        Remove every 'parent' / 'derives_from' attribute (any case)
        """
        # iterate over a copy of the keys: the dict is modified inside the loop
        for key in list(self.parsedOptions.keys()):
            if key.lower() in ("parent", "derives_from"):
                del self.parsedOptions[key]

    def export(self):
        """
        @return: the line, tab-separated, with the attributes rewritten as 'key=value;...' and a trailing newline
        """
        self.splittedLine[8] = ";".join(["%s=%s" % (key, value) for key, value in self.parsedOptions.items()])
        return "%s\n" % ("\t".join(self.splittedLine))


class GffCleaner(TranscriptListCleaner):
    """
    Cleaner for GFF files: keeps the lines of the accepted types and writes them
    so that each top-level line is followed by its children.
    @ivar lines:         the kept lines, indexed by ID
    @ivar acceptedTypes: the feature types to keep (None keeps every type)
    @ivar parents:       the lines with no known parent
    @ivar children:      for each parent ID, the list of its children lines
    """

    def __init__(self, verbosity = 1):
        super(GffCleaner, self).__init__(verbosity)
        self.lines = {}
        self.acceptedTypes = ["mRNA", "transcript", "exon"]
        self.parents = []
        self.children = {}

    def getFileFormats():
        return ["gff", "gff2", "gff3"]
    getFileFormats = staticmethod(getFileFormats)

    def setAcceptedTypes(self, types):
        self.acceptedTypes = types

    def parse(self):
        """
        Read the input handle and store the lines of accepted types.
        Blank lines and lines starting with '#' are skipped; reading stops at the
        first line starting with '>'.
        """
        progress = UnlimitedProgress(100000, "Reading input file", self.verbosity)
        for cpt, line in enumerate(self.inputHandle):
            # blank lines used to reach ParsedLine and raise "less than 9 fields"
            if not line.strip() or line[0] == "#": continue
            if line[0] == ">": break
            parsedLine = ParsedLine(line, cpt)
            if self.acceptedTypes == None or parsedLine.type in self.acceptedTypes:
                self.lines[parsedLine.id] = parsedLine
            progress.inc()
        progress.done()

    def sort(self):
        """
        Attach each line to those of its parents that were kept; a line with no
        kept parent loses its parent attributes and becomes a top-level line.
        """
        progress = Progress(len(self.lines.keys()), "Sorting file", self.verbosity)
        for line in self.lines.values():
            parentFound = False
            if line.parents:
                for parent in line.parents:
                    if parent in self.lines:
                        parentFound = True
                        self.children.setdefault(parent, []).append(line)
            if not parentFound:
                line.removeParent()
                self.parents.append(line)
            progress.inc()
        progress.done()

    def write(self):
        """
        Write every top-level line followed by its descendants
        """
        progress = Progress(len(self.parents), "Writing output file", self.verbosity)
        for line in self.parents:
            self.writeLine(line)
            progress.inc()
        progress.done()

    def writeLine(self, line):
        """
        Write a line, then (recursively) its children
        """
        self.outputHandle.write(line.export())
        if line.id in self.children:
            for child in self.children[line.id]:
                self.writeLine(child)

    def _clean(self):
        self.parse()
        self.sort()
        self.write()
"""
Clean a GTF file
"""

import shlex
from SMART.Java.Python.cleaning.TranscriptListCleaner import TranscriptListCleaner
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress

count = {}

class ParsedLine(object):
    """
    One line of a GTF file, split into its 9 fields, with its attribute column parsed.
    @ivar line:          the stripped input line
    @ivar cpt:           the index of the line in the input
    @ivar splittedLine:  the 9 whitespace-separated fields
    @ivar type:          the third field (feature type)
    @ivar parsedOptions: the attributes of the 9th field, as a dict
    @ivar transcriptId:  the value of the 'transcript_id' attribute (only set when the attribute is present)
    """

    def __init__(self, line, cpt):
        self.line = line
        self.cpt = cpt
        self.parse()

    def parse(self):
        """
        Split the line and parse its attributes
        @raise Exception: if the line has fewer than 9 fields
        """
        self.line = self.line.strip()
        self.splittedLine = self.line.split(None, 8)
        if len(self.splittedLine) < 9:
            raise Exception("Line '%s' has less than 9 fields. Exiting..." % (self.line))
        self.type = self.splittedLine[2]
        self.parseOptions()

    def parseOptions(self):
        """
        Parse the attributes of the 9th field, tokenized with shlex.
        The first token of each attribute is the key; the following tokens, up to
        and including the one ending with ';', form the value. Each value token is
        stored as ' "token"' (leading space, double quotes), without the ';'.
        """
        self.parsedOptions = {}
        key = None
        value = ""
        for option in shlex.split(self.splittedLine[8]):
            option = option.strip()
            if option == "": continue
            if key == None:
                key = option
            else:
                endValue = False
                if option[-1] == ";":
                    endValue = True
                    # str.rstrip returns a new string: the result used to be
                    # discarded, so the ';' stayed in every stored value
                    option = option.rstrip(";")
                value = "%s \"%s\"" % (value, option)
                if endValue:
                    self.parsedOptions[key] = value
                    if key == "transcript_id":
                        self.transcriptId = value
                    key = None
                    value = ""

    def export(self):
        """
        @return: the (stripped) input line, unchanged, with a trailing newline
        """
        return "%s\n" % (self.line)


class GtfCleaner(TranscriptListCleaner):
    """
    Cleaner for GTF files: keeps the lines of the accepted types and writes them
    grouped by transcript ID.
    @ivar acceptedTypes: the feature types to keep (None keeps every type)
    @ivar parents:       for each transcript ID, the list of its lines
    """

    def __init__(self, verbosity = 1):
        super(GtfCleaner, self).__init__(verbosity)
        self.acceptedTypes = ["exon"]
        self.parents = {}

    def getFileFormats():
        return ["gtf"]
    getFileFormats = staticmethod(getFileFormats)

    def setAcceptedTypes(self, types):
        self.acceptedTypes = types

    def parse(self):
        """
        Read the input handle and group the lines of accepted types by transcript ID.
        Blank lines and lines starting with '#' are skipped.
        """
        progress = UnlimitedProgress(100000, "Reading input file", self.verbosity)
        for cpt, line in enumerate(self.inputHandle):
            # blank lines used to reach ParsedLine and raise "less than 9 fields"
            if not line.strip() or line[0] == "#": continue
            parsedLine = ParsedLine(line, cpt)
            if self.acceptedTypes == None or parsedLine.type in self.acceptedTypes:
                self.parents.setdefault(parsedLine.transcriptId, []).append(parsedLine)
            progress.inc()
        progress.done()

    def write(self):
        """
        Write the lines transcript by transcript, in sorted order of transcript ID
        """
        progress = Progress(len(self.parents.keys()), "Writing output file", self.verbosity)
        for parent in sorted(self.parents.keys()):
            for line in self.parents[parent]:
                self.outputHandle.write(line.export())
            progress.inc()
        progress.done()

    def _clean(self):
        self.parse()
        self.write()
b/smart_toolShed/SMART/Java/Python/cleaning/TranscriptListCleaner.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,63 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
import sys
from SMART.Java.Python.structure.TranscriptList import TranscriptList
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress

class TranscriptListCleaner(object):
    """
    A (quite generic) class that cleans a file containing transcripts.
    Subclasses implement _clean(), reading self.inputHandle and writing self.outputHandle.
    @ivar verbosity:    verbosity
    @ivar inputHandle:  handle on the input file (set by setInputFileName)
    @ivar outputHandle: handle on the output file (set by setOutputFileName)
    """

    def __init__(self, verbosity = 0):
        self.verbosity = verbosity

    def setInputFileName(self, fileName):
        """
        Open the input file
        @param fileName: name of the file to read
        @raise Exception: if the file cannot be opened
        """
        try:
            self.inputHandle = open(fileName)
        except IOError:
            # the message used to be built from self.fileName, which does not
            # exist, so an AttributeError was raised instead of this error
            raise Exception("Error! Transcript file '%s' does not exist! Exiting..." % (fileName))

    def setOutputFileName(self, fileName):
        """
        Open the output file for writing
        @param fileName: name of the file to write
        @raise Exception: if the file cannot be opened
        """
        try:
            self.outputHandle = open(fileName, "w")
        except IOError:
            raise Exception("Error! Cannot open output file '%s' for writing! Exiting..." % (fileName))

    def getFileFormats():
        """
        @return: the list of formats handled by the cleaner; None in this base class
        """
        pass
    getFileFormats = staticmethod(getFileFormats)

    def close(self):
        """
        Close both file handles
        """
        self.inputHandle.close()
        self.outputHandle.close()

    def clean(self):
        """
        Run the cleaning implemented by the subclass, then close the files
        """
        self._clean()
        self.close()
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
"""Clusterize a set of transcripts"""

import os
from optparse import OptionParser
from commons.core.writer.WriterChooser import WriterChooser
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.writer.Gff3Writer import Gff3Writer
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle
from SMART.Java.Python.ncList.FileSorter import FileSorter
from SMART.Java.Python.misc.Progress import Progress

class Clusterize(object):
    """
    Merge the transcripts which are closer than a given distance.
    The input is first sorted and split per chromosome (FileSorter), then each
    chromosome file is scanned once.
    @ivar normalize:         stored by setNormalize (not read inside this class)
    @ivar presorted:         passed to the FileSorter
    @ivar distance:          max. distance between two transcripts to be merged
    @ivar colinear:          whether only transcripts with the same direction are merged
    @ivar nbWritten:         number of transcripts written
    @ivar nbMerges:          number of merges performed
    @ivar splittedFileNames: for each chromosome, the name of its sorted temporary file
    """

    def __init__(self, verbosity):
        self.normalize = False
        self.presorted = False
        self.distance = 1
        self.colinear = False
        self.nbWritten = 0
        self.nbMerges = 0
        self.verbosity = verbosity
        self.splittedFileNames = {}

    def __del__(self):
        # remove the per-chromosome temporary files; a file which is already
        # gone must not make the destructor raise
        for fileName in self.splittedFileNames.values():
            if os.path.exists(fileName):
                os.remove(fileName)

    def setInputFile(self, fileName, format):
        parserChooser = ParserChooser(self.verbosity)
        parserChooser.findFormat(format)
        self.parser = parserChooser.getParser(fileName)
        # the sorted file is created next to the input file
        self.sortedFileName = "%s_sorted.pkl" % (os.path.splitext(fileName)[0])

    def setOutputFileName(self, fileName, format="gff3", title="S-MART", feature="transcript", featurePart="exon"):
        writerChooser = WriterChooser()
        writerChooser.findFormat(format)
        self.writer = writerChooser.getWriter(fileName)
        self.writer.setTitle(title)
        self.writer.setFeature(feature)
        self.writer.setFeaturePart(featurePart)

    def setDistance(self, distance):
        self.distance = distance

    def setColinear(self, colinear):
        self.colinear = colinear

    def setNormalize(self, normalize):
        self.normalize = normalize

    def setPresorted(self, presorted):
        self.presorted = presorted

    def _sortFile(self):
        """
        Sort the input per chromosome and record the names and sizes of the resulting files
        """
        fs = FileSorter(self.parser, self.verbosity-4)
        fs.perChromosome(True)
        fs.setPresorted(self.presorted)
        fs.setOutputFileName(self.sortedFileName)
        fs.sort()
        self.splittedFileNames = fs.getOutputFileNames()
        self.nbElementsPerChromosome = fs.getNbElementsPerChromosome()
        self.nbElements = fs.getNbElements()

    def _iterate(self, chromosome):
        """
        Scan the sorted transcripts of a chromosome. Each new transcript is compared
        to the pending ones: an overlapping pending transcript is merged into the new
        one, a pending transcript farther than the distance is written, the other
        ones stay pending. The transcripts still pending at the end are written.
        """
        progress = Progress(self.nbElementsPerChromosome[chromosome], "Checking chromosome %s" % (chromosome), self.verbosity)
        transcripts = []
        parser = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity)
        for newTranscript in parser.getIterator():
            newTranscripts = []
            for oldTranscript in transcripts:
                if self._checkOverlap(newTranscript, oldTranscript):
                    self._merge(newTranscript, oldTranscript)
                elif self._checkPassed(newTranscript, oldTranscript):
                    self._write(oldTranscript)
                else:
                    newTranscripts.append(oldTranscript)
            newTranscripts.append(newTranscript)
            transcripts = newTranscripts
            progress.inc()
        for transcript in transcripts:
            self._write(transcript)
        progress.done()

    def _merge(self, transcript1, transcript2):
        """
        Merge transcript2 into transcript1 (transcript2 first takes the direction of transcript1)
        """
        self.nbMerges += 1
        transcript2.setDirection(transcript1.getDirection())
        transcript1.merge(transcript2)

    def _write(self, transcript):
        self.nbWritten += 1
        self.writer.addTranscript(transcript)

    def _checkOverlap(self, transcript1, transcript2):
        """
        @return: whether the two transcripts should be merged
        """
        if self.colinear and transcript1.getDirection() != transcript2.getDirection():
            return False
        return not (transcript1.getDistance(transcript2) > self.distance)

    def _checkPassed(self, transcript1, transcript2):
        """
        @return: whether the two transcripts are farther apart than the max. distance
        """
        return (transcript1.getDistance(transcript2) > self.distance)

    def run(self):
        self._sortFile()
        for chromosome in sorted(self.splittedFileNames.keys()):
            self._iterate(chromosome)
        self.writer.close()
        if self.verbosity > 0:
            # single-argument print(...) gives the same output with the print
            # statement (Python 2) and the print function (Python 3)
            print("# input: %d" % (self.nbElements))
            print("# written: %d (%d%% overlaps)" % (self.nbWritten, 0 if (self.nbElements == 0) else ((float(self.nbWritten) / self.nbElements) * 100)))
            print("# merges: %d" % (self.nbMerges))


if __name__ == "__main__":
    description = "Clusterize v1.0.3: clusterize the data which overlap. [Category: Merge]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")
    parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="output file format [format: transcript file format]")
    parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="merge colinear transcripts only [format: bool] [default: false]")
    parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="max. distance between two transcripts to be merged [format: int] [default: 0]")
    parser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")
    (options, args) = parser.parse_args()

    c = Clusterize(options.verbosity)
    c.setInputFile(options.inputFileName, options.format)
    c.setOutputFileName(options.outputFileName, options.outputFormat)
    c.setColinear(options.colinear)
    c.setDistance(options.distance)
    c.setNormalize(options.normalize)
    c.run()
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import re +from commons.core.writer.WriterChooser import WriterChooser +""" +Cluster the data into regions (defined by size and overlap with next region) and keep only highest peaks. 
class ClusterizeBySlidingWindows(object):
    """Count transcripts, or aggregate one of their tags, over sliding windows.

    Windows ("bins") have a fixed size and start every (size - overlap)
    positions; a transcript is assigned to the bin(s) containing its start.
    """

    def __init__(self, verbosity = 0):
        """Store the trace level and give every option a defined default."""
        self.verbosity = verbosity
        self.strands = (0, )
        self.normalize = False
        self.plot = None
        self.excel = None
        self.outputFileName = ''
        self.defaultValue = None
        # These used to exist only after the matching setter had been called,
        # so any code path reading them first failed with AttributeError.
        self.size = None
        self.overlap = None
        self.tag = None
        self.operation = None
        self.outputTagName = None

    def __del__(self):
        pass

    def setInputFile(self, fileName, format):
        """Open the input as a TranscriptContainer."""
        self.parser = TranscriptContainer(fileName, format, self.verbosity)

    def setOutputFileName(self, fileName, format="gff", title="S-MART", feature="transcript", featurePart="exon"):
        """Create the writer for `format` and configure its title and feature names."""
        writerChooser = WriterChooser(self.verbosity)
        writerChooser.findFormat(format)
        self.writer = writerChooser.getWriter(fileName)
        self.writer.setTitle(title)
        self.writer.setFeature(feature)
        self.writer.setFeaturePart(featurePart)

    def setWindowSize(self, size):
        self.size = size

    def setWindowOverlap(self, overlap):
        self.overlap = overlap

    def setTag(self, tag):
        self.tag = tag

    def setOperation(self, operation):
        self.operation = operation

    def setBothStrands(self, bothStrands):
        """When true, keep separate bins for strand -1 and strand 1."""
        if bothStrands:
            self.strands = (-1, 1)

    def setNormalize(self, normalize):
        self.normalize = normalize

    def setPlot(self, plot):
        self.plot = plot

    def setExcel(self, excel):
        self.excel = excel

    def setOutputTag(self, tag):
        self.outputTagName = tag

    def setDefaultValue(self, defaultValue):
        self.defaultValue = defaultValue

    def checkOptions(self):
        """Validate the operation and the window geometry.

        Raises Exception for an unknown operation, for a missing size or
        overlap, and when the window step (size - overlap) is not positive.
        """
        if self.operation != None and self.operation not in ("sum", "avg", "med", "min", "max"):
            raise Exception("Do not understand operation '%s'! Aborting..." % (self.operation))
        if self.size is None or self.overlap is None:
            raise Exception("Window size and overlap must both be given! Aborting...")
        if self.size <= self.overlap:
            # a non-positive step would make getBinsFromPos divide by zero
            # or number the bins backwards
            raise Exception("Window size (%s) must be greater than overlap (%s)! Aborting..." % (self.size, self.overlap))

    def getChromosomeSizes(self):
        """Record, per chromosome, the largest transcript start seen in the input."""
        self.sizes = {}
        progress = Progress(self.parser.getNbTranscripts(), "Getting sizes in genome", self.verbosity)
        for transcript in self.parser.getIterator():
            self.sizes[transcript.getChromosome()] = max(transcript.getStart(), self.sizes.get(transcript.getChromosome(), 0))
            progress.inc()
        progress.done()

    def getBinsFromPos(self, pos):
        """Return the bin(s) holding position `pos`: (bin, ) or (bin - 1, bin)."""
        # floor division: identical to "/" on Python 2 ints, and still an
        # int when true division is in effect
        bin = (pos - 1) // (self.size - self.overlap)
        if bin >= 1 and pos <= bin * (self.size - self.overlap) + self.overlap:
            return (bin - 1, bin)
        return (bin, )

    def getPosFromBin(self, bin):
        """Return the (start, end) positions covered by `bin`."""
        return (bin * (self.size - self.overlap) + 1, bin * (self.size - self.overlap) + self.size)

    def initializeBins(self):
        """Create empty counters for every strand, chromosome and bin."""
        self.binsPerStrand = {}
        self.sumsPerStrand = {}
        self.valuesPerStrand = {}
        self.toBePlottedPerStrand = {}
        for strand in self.strands:
            self.binsPerStrand[strand] = {}
            self.sumsPerStrand[strand] = {}
            self.valuesPerStrand[strand] = {}
            self.toBePlottedPerStrand[strand] = {}
            for chromosome in self.sizes:
                binRange = range(self.getBinsFromPos(self.sizes[chromosome])[-1] + 1)
                self.binsPerStrand[strand][chromosome] = dict([[i, 0] for i in binRange])
                self.sumsPerStrand[strand][chromosome] = dict([[i, 0.0] for i in binRange])
                self.valuesPerStrand[strand][chromosome] = dict([[i, []] for i in binRange])
                self.toBePlottedPerStrand[strand][chromosome] = dict([[i, 0] for i in binRange])

    def getNbElements(self, transcript):
        """Return the "nbElements" tag divided by the "nbOccurrences" tag (each defaults to 1)."""
        nbOccurrences = 1 if "nbOccurrences" not in transcript.getTagNames() else transcript.getTagValue("nbOccurrences")
        nbElements = 1 if "nbElements" not in transcript.getTagNames() else transcript.getTagValue("nbElements")
        nbOccurrences = float(nbOccurrences)
        nbElements = float(nbElements)
        nbElements /= float(nbOccurrences)
        return nbElements

    def setBins(self):
        """Add every transcript to its bin(s); collect the tag values when a tag is set."""
        progress = Progress(self.parser.getNbTranscripts(), "Setting bins", self.verbosity)
        for transcript in self.parser.getIterator():
            nbElements = self.getNbElements(transcript)
            strand = transcript.getDirection() if len(self.strands) == 2 else 0
            for bin in self.getBinsFromPos(transcript.getStart()):
                self.binsPerStrand[strand][transcript.getChromosome()][bin] += nbElements
                if self.tag != None:
                    if self.tag not in transcript.getTagNames():
                        if self.defaultValue is None:
                            raise Exception("Tag %s undefined in transcript %s" % (self.tag, transcript))
                        value = self.defaultValue
                    else:
                        value = float(transcript.getTagValue(self.tag))
                    self.sumsPerStrand[strand][transcript.getChromosome()][bin] += value
                    self.valuesPerStrand[strand][transcript.getChromosome()][bin].append(value)
            progress.inc()
        progress.done()

    def aggregateData(self):
        """Fill toBePlottedPerStrand according to the selected operation."""
        if self.operation == "sum":
            self.computeSumData()
        elif self.operation == "avg":
            self.computeAvgData()
        elif self.operation == "med":
            self.computeMedData()
        elif self.operation == "min":
            self.computeMinData()
        elif self.operation == "max":
            self.computeMaxData()
        elif self.operation == "GCpercent":
            self.computeGCPercent()
        else:
            # no operation: report the (normalized) element counts
            self.toBePlottedPerStrand = self.binsPerStrand

    def computeSumData(self):
        self.toBePlottedPerStrand = self.sumsPerStrand

    def computeAvgData(self):
        """Average = sum of tag values / element count, for non-empty bins."""
        for strand in self.strands:
            for chromosome in self.binsPerStrand[strand]:
                for bin in self.binsPerStrand[strand][chromosome]:
                    if self.binsPerStrand[strand][chromosome][bin] != 0:
                        self.toBePlottedPerStrand[strand][chromosome][bin] = float(self.sumsPerStrand[strand][chromosome][bin]) / self.binsPerStrand[strand][chromosome][bin]

    def computeMedData(self):
        """Median of the tag values of each non-empty bin (values are sorted in place)."""
        for strand in self.strands:
            for chromosome in self.binsPerStrand[strand]:
                for bin in self.binsPerStrand[strand][chromosome]:
                    values = self.valuesPerStrand[strand][chromosome][bin]
                    if values:
                        values.sort()
                        size = len(values)
                        # "//" keeps the list indices integral
                        if size % 2 == 1:
                            self.toBePlottedPerStrand[strand][chromosome][bin] = values[(size - 1) // 2]
                        else:
                            self.toBePlottedPerStrand[strand][chromosome][bin] = (values[size // 2 - 1] + values[size // 2]) / 2.0

    def computeMinData(self):
        for strand in self.strands:
            for chromosome in self.binsPerStrand[strand]:
                for bin in self.binsPerStrand[strand][chromosome]:
                    if self.valuesPerStrand[strand][chromosome][bin]:
                        self.toBePlottedPerStrand[strand][chromosome][bin] = min(self.valuesPerStrand[strand][chromosome][bin])

    def computeMaxData(self):
        for strand in self.strands:
            for chromosome in self.binsPerStrand[strand]:
                for bin in self.binsPerStrand[strand][chromosome]:
                    if self.valuesPerStrand[strand][chromosome][bin]:
                        self.toBePlottedPerStrand[strand][chromosome][bin] = max(self.valuesPerStrand[strand][chromosome][bin])

    def computeGCPercent(self):
        # NOTE(review): checkOptions() rejects "GCpercent", and setBins() stores
        # plain lists of floats in valuesPerStrand, which have no countNt() /
        # getSize(); this method looks unreachable from run() as written.
        for strand in self.strands:
            for chromosome in self.binsPerStrand[strand]:
                for bin in self.binsPerStrand[strand][chromosome]:
                    if self.valuesPerStrand[strand][chromosome][bin]:
                        subSequence = self.valuesPerStrand[strand][chromosome][bin]
                        NPercent = 100 * (subSequence.countNt("N") / float(subSequence.getSize()))
                        if NPercent >= 50:
                            currentGCpercent = "NA"
                        else:
                            currentGCpercent = subSequence.getGCpercentageInSequenceWithoutCountNInLength()

                        self.toBePlottedPerStrand[strand][chromosome][bin] = currentGCpercent
        #TODO: see if a map method could be used for the various "compute" methods

    def plotData(self):
        """Plot one line per strand and chromosome with RPlotter."""
        if self.plot != None:
            for strand in self.strands:
                for chromosome in self.toBePlottedPerStrand[strand]:
                    if len(self.toBePlottedPerStrand[strand][chromosome].keys()) > 0:
                        # NOTE(review): every plotter is given the same file
                        # name (self.plot) -- confirm later plots are not
                        # meant to go to separate files.
                        plotter = RPlotter(self.plot, self.verbosity)
                        plotter.setFill(0)
                        plotter.addLine(self.toBePlottedPerStrand[strand][chromosome], chromosome)
                        plotter.plot()

    def writeExcel(self):
        """Write, per strand, a comma-separated table: one header row, one row per chromosome."""
        if self.excel != None:
            excelFile = open(self.excel, "w")
            try:
                for strand in self.strands:
                    perChromosome = self.toBePlottedPerStrand[strand]
                    allBins = [bin for chromosome in perChromosome for bin in perChromosome[chromosome]]
                    if not allBins:
                        # nothing was binned for this strand: max() would fail
                        continue
                    for bin in range(0, max(allBins) + 1):
                        excelFile.write(",%d-%d" % self.getPosFromBin(bin))
                    excelFile.write("\n")
                    for chromosome in perChromosome:
                        excelFile.write("%s" % (chromosome))
                        # ascending bins, so that the values line up with the header
                        for bin in sorted(perChromosome[chromosome]):
                            excelFile.write(",%f" % (perChromosome[chromosome][bin]))
                        excelFile.write("\n")
            finally:
                excelFile.close()

    def printRegions(self):
        """Write one transcript per bin, carrying the count and the aggregated value as tags."""
        cpt = 1
        tagOp = "nb"
        tagName = "Elements"
        outputTagName = "nbElements"
        if self.operation != None:
            tagOp = self.operation.lower()
        if self.tag != None:
            tagName = self.tag.title()
        if self.outputTagName != None:
            outputTagName = self.outputTagName

        for strand in self.strands:
            for chromosome in self.toBePlottedPerStrand[strand]:
                for bin in self.toBePlottedPerStrand[strand][chromosome]:
                    transcript = Transcript()
                    transcript.setName("region%d" % cpt)
                    transcript.setChromosome(chromosome)
                    transcript.setStart(self.getPosFromBin(bin)[0])
                    transcript.setEnd(self.getPosFromBin(bin)[1])
                    transcript.setDirection(1 if strand == 0 else strand)
                    transcript.setTagValue(outputTagName, self.binsPerStrand[strand][chromosome][bin])
                    transcript.setTagValue("%s%s" % (tagOp, tagName), str(self.toBePlottedPerStrand[strand][chromosome][bin]))
                    self.writer.addTranscript(transcript)
                    cpt += 1
        self.writer.close()

    def run(self):
        """Check the options, bin the input, aggregate, then emit table, plot and regions."""
        self.checkOptions()
        self.getChromosomeSizes()
        self.initializeBins()
        self.setBins()
        self.aggregateData()
        if self.excel:
            self.writeExcel()
        if self.plot:
            self.plotData()
        self.printRegions()


if __name__ == "__main__":

    # parse command line
    description = "Clusterize by Sliding Windows v1.0.1: Produces a GFF3 file that clusters a list of transcripts using a sliding window. [Category: Sliding Windows]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--inputFormat", dest="inputFormat", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")
    parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="format of the output file [format: transcript file format]")
    parser.add_option("-s", "--size", dest="size", action="store", type="int", help="size of the regions [compulsory] [format: int]")
    parser.add_option("-e", "--overlap", dest="overlap", action="store", type="int", help="overlap between two consecutive regions [compulsory] [format: int]")
    parser.add_option("-m", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
    parser.add_option("-g", "--tag", dest="tag", action="store", default=None, type="string", help="use a given tag as input (instead of summing number of features) [format: string]")
    parser.add_option("-r", "--operation", dest="operation", action="store", default=None, type="string", help="combine tag value with given operation [format: choice (sum, avg, med, min, max)]")
    parser.add_option("-d", "--defaultValue", dest="defaultValue", action="store", type="float", help="default value for input tag [format: float]")
    parser.add_option("-w", "--write", dest="writeTag", action="store", default=None, type="string", help="print the result in the given tag (default usually is 'nbElements') [format: string]")
    parser.add_option("-2", "--strands", dest="strands", action="store_true", default=False, help="consider the two strands separately [format: bool] [default: false]")
    parser.add_option("-p", "--plot", dest="plot", action="store", default=None, type="string", help="plot regions to the given file [format: output file in PNG format]")
    parser.add_option("-x", "--excel", dest="excel", action="store", default=None, type="string", help="write an Excel file to the given file [format: output file in Excel format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")
    (options, args) = parser.parse_args()

    cbsw = ClusterizeBySlidingWindows(options.verbosity)
    cbsw.setInputFile(options.inputFileName, options.inputFormat)
    cbsw.setOutputFileName(options.outputFileName, options.outputFormat)
    cbsw.setWindowSize(options.size)
    cbsw.setWindowOverlap(options.overlap)
    cbsw.setTag(options.tag)
    cbsw.setDefaultValue(options.defaultValue)
    cbsw.setOperation(options.operation)
    cbsw.setOutputTag(options.writeTag)
    cbsw.setBothStrands(options.strands)
    # -m/--normalize was parsed but never handed over to the object
    cbsw.setNormalize(options.normalize)
    cbsw.setPlot(options.plot)
    cbsw.setExcel(options.excel)
    cbsw.run()
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class CompareOverlapping(object):
    """Command-line front end of TranscriptListsComparator.

    File 1 is used as the query and file 2 as the reference; the result is
    written in GFF3 format.
    """

    def __init__(self):
        self._options = None


    def setAttributesFromCmdLine(self):
        """Parse sys.argv and keep the resulting options in self._options."""
        description = "Compare Overlapping v1.0.3: Get the data which overlap with a reference set. [Category: Data Comparison]"

        parser = OptionParser(description = description)
        parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
        parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]")
        parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
        parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of file 2 [compulsory] [format: transcript file format]")
        parser.add_option("-o", "--output", dest="output", action="store", default=None, type="string", help="output file [compulsory] [format: output file in GFF3 format]")
        parser.add_option("-S", "--start1", dest="start1", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 1 (do not use it with -U) [format: int]")
        parser.add_option("-s", "--start2", dest="start2", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 2 (do not use it with -u) [format: int]")
        parser.add_option("-U", "--end1", dest="end1", action="store", default=None, type="int", help="only consider the n last nucleotides of the transcripts in file 1 (do not use it with -S) [format: int]")
        parser.add_option("-u", "--end2", dest="end2", action="store", default=None, type="int", help="only consider the n last nucleotides of the transcripts in file 2 (do not use it with -s) [format: int]")
        parser.add_option("-t", "--intron", dest="introns", action="store_true", default=False, help="also report introns [format: bool] [default: false]")
        parser.add_option("-E", "--5primeExtension1", dest="fivePrime1", action="store", default=None, type="int", help="extension towards 5' in file 1 [format: int]")
        parser.add_option("-e", "--5primeExtension2", dest="fivePrime2", action="store", default=None, type="int", help="extension towards 5' in file 2 [format: int]")
        parser.add_option("-N", "--3primeExtension1", dest="threePrime1", action="store", default=None, type="int", help="extension towards 3' in file 1 [format: int]")
        parser.add_option("-n", "--3primeExtension2", dest="threePrime2", action="store", default=None, type="int", help="extension towards 3' in file 2 [format: int]")
        parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="colinear only [format: bool] [default: false]")
        parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="antisense only [format: bool] [default: false]")
        parser.add_option("-d", "--distance", dest="distance", action="store", default=None, type="int", help="accept some distance between query and reference [format: int]")
        parser.add_option("-k", "--included", dest="included", action="store_true", default=False, help="keep only elements from file 1 which are included in an element of file 2 [format: bool] [default: false]")
        parser.add_option("-K", "--including", dest="including", action="store_true", default=False, help="keep only elements from file 2 which are included in an element of file 1 [format: bool] [default: false]")
        parser.add_option("-m", "--minOverlap", dest="minOverlap", action="store", default=1, type="int", help="minimum number of nucleotides overlapping to declare an overlap [format: int] [default: 1]")
        parser.add_option("-p", "--pcOverlap", dest="pcOverlap", action="store", default=None, type="int", help="minimum percentage of nucleotides to overlap to declare an overlap [format: int]")
        parser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]")
        parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]")
        parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
        parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")
        (self._options, args) = parser.parse_args()


    def run(self):
        """Configure the comparator from the parsed options and run the comparison."""
        options = self._options
        logHandle = None
        if options.log:
            # NOTE(review): the log is opened on the very path that is also
            # handed to TranscriptWriter below -- confirm the writer derives a
            # different file name, otherwise log and GFF3 output overwrite
            # each other.
            logHandle = open(options.output, "w")

        try:
            transcriptContainer1 = TranscriptContainer(options.inputFileName1, options.format1, options.verbosity)
            transcriptContainer2 = TranscriptContainer(options.inputFileName2, options.format2, options.verbosity)
            writer = TranscriptWriter(options.output, "gff3", options.verbosity)

            comparator = TranscriptListsComparator(logHandle, options.verbosity)
            comparator.restrictToStart(comparator.QUERY, options.start1)
            comparator.restrictToStart(comparator.REFERENCE, options.start2)
            comparator.restrictToEnd(comparator.QUERY, options.end1)
            comparator.restrictToEnd(comparator.REFERENCE, options.end2)
            comparator.extendFivePrime(comparator.QUERY, options.fivePrime1)
            comparator.extendFivePrime(comparator.REFERENCE, options.fivePrime2)
            comparator.extendThreePrime(comparator.QUERY, options.threePrime1)
            comparator.extendThreePrime(comparator.REFERENCE, options.threePrime2)
            comparator.acceptIntrons(comparator.QUERY, options.introns)
            comparator.acceptIntrons(comparator.REFERENCE, options.introns)
            comparator.getAntisenseOnly(options.antisense)
            comparator.getColinearOnly(options.colinear)
            comparator.getInvert(options.exclude)
            comparator.setMaxDistance(options.distance)
            comparator.setMinOverlap(options.minOverlap)
            comparator.setPcOverlap(options.pcOverlap)
            comparator.setIncludedOnly(options.included)
            comparator.setIncludingOnly(options.including)
            comparator.includeNotOverlapping(options.notOverlapping)
            comparator.computeOdds(True)
            comparator.setInputTranscriptContainer(comparator.QUERY, transcriptContainer1)
            comparator.setInputTranscriptContainer(comparator.REFERENCE, transcriptContainer2)
            comparator.setOutputWriter(writer)
            comparator.compareTranscriptList()
        finally:
            # release the log file even when the comparison fails
            if logHandle is not None:
                logHandle.close()

        if not options.exclude:
            odds = comparator.getOdds()
            if options.verbosity > 0 and odds:
                # print(...) with one pre-formatted string behaves the same
                # under Python 2 and Python 3
                print("min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(odds))

if __name__ == "__main__":
    icompareOverlapping = CompareOverlapping()
    icompareOverlapping.setAttributesFromCmdLine()
    icompareOverlapping.run()
class ConvertTranscriptFile(object):
    """Read a transcript (or mapping) file and write it back in another format."""

    def __init__(self,inputFileName="", inputFormat ="", outputFileName="", outputFormat="", name="", sequenceFileName=None, strands=False, galaxy=False, feature=None, featurePart=None, verbosity=1):
        self.inputFileName = inputFileName
        self.inputFormat = inputFormat
        self.outputFileName = outputFileName
        self.outputFormat = outputFormat
        self.name = name
        self.sequenceFileName = sequenceFileName
        self.strands = strands
        self.galaxy = galaxy

        # feature / featurePart can only be given through the constructor;
        # the command line does not expose them.
        self.feature=feature
        self.featurePart=featurePart

        self.verbosity = verbosity

    def setAttributesFromCmdLine(self):
        """Parse sys.argv and copy the options onto the instance."""
        description = "Convert Transcript File v1.0.3: Convert a file from a format to another. [Category: Conversion]"
        parser = OptionParser(description = description)
        parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in format given by -f]")
        parser.add_option("-f", "--inputFormat", dest="inputFormat", action="store", type="string", help="format of the input file [compulsory] [format: transcript or mapping file format]")
        parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in format given by -g]")
        parser.add_option("-g", "--outputFormat", dest="outputFormat", action="store", type="string", help="format of the output file [compulsory] [format: transcript file format]")
        parser.add_option("-n", "--name", dest="name", action="store", default="SMART", type="string", help="name for the transcripts [format: string] [default: SMART]")
        parser.add_option("-s", "--sequences", dest="sequenceFileName", action="store", default=None, type="string", help="give the corresponding Multi-Fasta file (useful for EMBL format) [format: string]")
        parser.add_option("-t", "--strands", dest="strands", action="store_true", default=False, help="consider the 2 strands as different (only useful for writing WIG files) [format: bool] [default: False]")
        parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")
        parser.add_option("-G", "--galaxy", dest="galaxy", action="store_true", default=False, help="used for galaxy [format: bool] [default: False]")
        (options, args) = parser.parse_args()
        self._setAttributesFromOptions(options)

    def _setAttributesFromOptions(self, options):
        """Copy the parsed option values onto the matching attributes."""
        self.inputFileName = options.inputFileName
        self.inputFormat = options.inputFormat
        self.outputFileName = options.outputFileName
        self.outputFormat = options.outputFormat
        self.name = options.name
        self.sequenceFileName = options.sequenceFileName
        self.strands = options.strands
        self.galaxy = options.galaxy
        self.verbosity = options.verbosity

    def run(self):
        """Connect a TranscriptContainer to a TranscriptWriter and convert the file."""
        # create parser
        parser = TranscriptContainer(self.inputFileName, self.inputFormat, self.verbosity)
        # create writer
        writer = TranscriptWriter(self.outputFileName, self.outputFormat, self.verbosity)
        try:
            # connect parser and writer
            writer.setContainer(parser)

            if self.name is not None:
                writer.setTitle(self.name)
            if self.feature is not None:
                writer.setFeature(self.feature)
            if self.featurePart is not None:
                writer.setFeaturePart(self.featurePart)
            if self.sequenceFileName is not None:
                writer.addSequenceFile(self.sequenceFileName)

            if self.verbosity > 0:
                # print(...) with one pre-formatted string behaves the same
                # under Python 2 and Python 3
                print("%i items found" % (parser.getNbItems()))

            if self.strands:
                writer.setStrands(True)
            # convert
            writer.write()
        finally:
            # close the writer even when the conversion fails
            writer.close()

if __name__ == "__main__":
    iConvertTranscriptFile = ConvertTranscriptFile()
    iConvertTranscriptFile.setAttributesFromCmdLine()
    iConvertTranscriptFile.run()
if __name__ == "__main__":

    # Command-line interface: each entry is (flags, add_option keywords).
    description = "Coordinates to Sequences v1.0.2: Extract the sequences from a list of coordinates. [Category: Conversion]"
    optionTable = (
        (("-i", "--input"), dict(dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")),
        (("-f", "--format"), dict(dest="format", action="store", type="string", help="format of file [compulsory] [format: transcript file format]")),
        (("-s", "--sequences"), dict(dest="sequences", action="store", type="string", help="file that contains the sequences [compulsory] [format: file in FASTA format]")),
        (("-o", "--output"), dict(dest="outputFileName", action="store", default=None, type="string", help="output file (FASTA format) [format: output file in FASTA format]")),
        (("-v", "--verbosity"), dict(dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")),
    )
    optionParser = OptionParser(description = description)
    for flags, keywords in optionTable:
        optionParser.add_option(*flags, **keywords)
    (options, args) = optionParser.parse_args()

    # Readers for the coordinates and for the sequences they refer to,
    # and the FASTA writer that receives the extracted sequences.
    transcripts = TranscriptContainer(options.inputFileName, options.format, options.verbosity)
    sequenceParser = FastaParser(options.sequences, options.verbosity)
    fastaWriter = FastaWriter(options.outputFileName, options.verbosity)

    # Extract one sequence per transcript and hand it to the writer.
    progress = Progress(transcripts.getNbTranscripts(), "Reading %s" % (options.inputFileName), options.verbosity)
    for transcript in transcripts.getIterator():
        fastaWriter.addSequence(transcript.extractSequence(sequenceParser))
        progress.inc()
    progress.done()
def convertFastqToFasta(inputFile, outputFastaFile):
    """Copy the records of a four-line-per-record FASTQ stream as FASTA.

    Every record is expected to span exactly four lines: '@name', the
    sequence, '+' optionally followed by the same name, and the quality
    string.  The name and the sequence are written to the output as
    '>name' and the sequence line; the quality string is ignored.

    Parameters:
        inputFile: iterable of text lines (for instance an open file).
        outputFastaFile: object with a write() method receiving the FASTA text.

    Raises:
        SystemExit: with an explanatory message when a name line does not
            start with '@', when a separator line does not start with '+',
            or when the name repeated on the '+' line differs from the
            sequence name.  Blank lines in those positions are reported the
            same way instead of failing with an IndexError.
    """
    sequenceName = None
    for lineNumber, line in enumerate(inputFile, 1):
        line = line.strip()
        # Position of the line inside its record: 0 = name, 1 = sequence,
        # 2 = quality name, 3 = quality string.
        section = (lineNumber - 1) % 4
        if section == 0:
            # startswith() is also safe on an empty line, unlike line[0].
            if not line.startswith("@"):
                sys.exit("Error! Sequence name '%s' does not start with '@' (line is %d)" % (line, lineNumber))
            sequenceName = line[1:]
            outputFastaFile.write(">%s\n" % (sequenceName))
        elif section == 1:
            outputFastaFile.write("%s\n" % (line))
        elif section == 2:
            if not line.startswith("+"):
                sys.exit("Error! Quality name '%s' does not start with '+' (line is %d)" % (line, lineNumber))
            # The name after '+' is optional, but must match when present.
            if len(line) > 1 and sequenceName != line[1:]:
                sys.exit("Names in sequence and qual are different (%s, %s) (line is %d)" % (sequenceName, line[1:], lineNumber))
        # section == 3: quality values are not needed for FASTA output.


if __name__ == "__main__":

    # parse command line
    description = "FastQ to FastA v1.0.1: Convert a FastQ file into a FastA file. [Category: Personnal]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in FASTQ format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in FASTA format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]")
    (options, args) = parser.parse_args()

    # try/finally closes both files even when the conversion aborts
    # through sys.exit().
    inputFile = open(options.inputFileName)
    try:
        outputFastaFile = open(options.outputFileName, "w")
        try:
            convertFastqToFasta(inputFile, outputFastaFile)
        finally:
            outputFastaFile.close()
    finally:
        inputFile.close()
def writeOddsCsv(csvResults, csvFile):
    """Write the 'number -> transcript names' table in CSV form.

    Parameters:
        csvResults: mapping from a transcript name to a number.
        csvFile: object with a write() method receiving the CSV text.

    The output starts with the header 'Number,Transcript', followed by one
    line per distinct number in increasing order; each line holds the number,
    a comma, and every name carrying that number, each followed by a space.
    Names are grouped in a single pass over the mapping instead of
    rescanning the whole mapping once per distinct number.
    """
    namesPerNumber = {}
    for name in csvResults:
        namesPerNumber.setdefault(csvResults[name], []).append(name)

    csvFile.write("Number,Transcript\n")
    for number in sorted(namesPerNumber):
        csvFile.write("%d," % (number))
        for name in namesPerNumber[number]:
            csvFile.write("%s " % (name))
        csvFile.write("\n")


if __name__ == "__main__":

    # parse command line
    description = "Find TSS v1.0.1: Find the transcription start site of a list of transcripts. [Category: Merge]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="output", action="store", default=None, type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
    parser.add_option("-d", "--distance", dest="distance", action="store", default=10, type="int", help="distance between two reads to mark the same TSS [format: int] [default: 10]")
    parser.add_option("-e", "--colinear", dest="colinear", action="store_true", default=False, help="group by strand [format: bool] [default: false]")
    parser.add_option("-c", "--csv", dest="csv", action="store", default=None, type="string", help="output a CSV file in the given path [format: output file in Excel format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # Self-merge the input transcripts, restricted to their first
    # nucleotide, and write the merged elements as GFF3.
    transcriptContainer = TranscriptContainer(options.inputFileName, options.format, options.verbosity)
    transcriptListComparator = TranscriptListsComparator(None, options.verbosity)
    transcriptListComparator.restrictToStart(transcriptListComparator.QUERY, 1)
    transcriptListComparator.setMaxDistance(options.distance)
    transcriptListComparator.aggregate(True)
    transcriptListComparator.computeOdds(True)
    transcriptListComparator.getColinearOnly(options.colinear)
    transcriptListComparator.setNormalization(options.normalize)
    transcriptListComparator.setInputTranscriptContainer(transcriptListComparator.QUERY, transcriptContainer)
    transcriptListComparator.setOutputWriter(Gff3Writer(options.output, options.verbosity))
    transcriptListComparator.compareTranscriptListSelfMerge()

    if options.csv != None:
        csvResults = transcriptListComparator.getOddsPerTranscript()
        csvFile = open(options.csv, "w")
        # Close the file even if writing fails half-way.
        try:
            writeOddsCsv(csvResults, csvFile)
        finally:
            csvFile.close()
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class Fold(object):
    """
    Fold a series of transcripts

    Thin driver around RnaFoldLauncher: the setters configure the launcher,
    and start() sends the launcher's results to a GFF3 writer.
    """

    def __init__(self, verbosity = 0):
        """Create the launcher; the output writer is set up separately.

        Parameters:
            verbosity: trace level handed to every helper object.
        """
        self.verbosity = verbosity
        self.rnaFoldLauncher = RnaFoldLauncher(verbosity)
        self.gff3Writer = None


    def setInputFileName(self, fileName, format):
        """Give the launcher the transcripts read from fileName.

        Parameters:
            fileName: path of the transcript file.
            format: format name handed to TranscriptContainer.
        """
        # Use the instance's own verbosity: the script-level 'options'
        # object does not exist when this class is imported as a library.
        transcriptContainer = TranscriptContainer(fileName, format, self.verbosity)
        self.rnaFoldLauncher.setTranscriptList(transcriptContainer)


    def setOutputFileName(self, fileName):
        """Create the GFF3 writer; '.gff3' is appended to fileName."""
        self.gff3Writer = Gff3Writer("%s.gff3" % (fileName), self.verbosity)


    def setGenomeFileName(self, fileName):
        """Forward the genome file name to the launcher."""
        self.rnaFoldLauncher.setGenomeFile(fileName)


    def setExtensions(self, fivePrime, threePrime):
        """Forward the 5' and 3' extension sizes to the launcher."""
        self.rnaFoldLauncher.setExtensions(fivePrime, threePrime)


    def start(self):
        """Fetch the launcher's results and add them to the GFF3 writer.

        setOutputFileName() must have been called first, otherwise no
        writer exists yet.
        """
        self.gff3Writer.addTranscriptList(self.rnaFoldLauncher.getResults())



if __name__ == "__main__":

    # parse command line
    description = "Fold v1.0.1: Fold a list of transcript and give the energy. [Category: Personal]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-g", "--genome", dest="genomeFileName", action="store", type="string", help="genome file name [format: file in FASTA format]")
    parser.add_option("-5", "--fivePrime", dest="fivePrime", action="store", type="int", help="extend towards the 5' end [format: int]")
    parser.add_option("-3", "--threePrime", dest="threePrime", action="store", type="int", help="extend towards the 3' end [format: int]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    folder = Fold(options.verbosity)
    folder.setInputFileName(options.inputFileName, options.format)
    folder.setOutputFileName(options.outputFileName)
    folder.setExtensions(options.fivePrime, options.threePrime)
    folder.setGenomeFileName(options.genomeFileName)
    folder.start()
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class DifferenceGetter(object):
    """Compute what remains once a set of annotations has been removed.

    Two modes, chosen in run():
      * a reference file was given: the comparison is delegated to
        TranscriptListsComparator.getDifferenceTranscriptList();
      * otherwise: every region of the sequence file starts as one interval
        [1, size], the annotated intervals are cut out of it, and what is
        left is written as 'region_N' transcripts.
    """

    def __init__(self, verbosity):
        """Parameters:
            verbosity: trace level handed to every helper object.
        """
        self.verbosity = verbosity
        self.annotationParser = None
        self.referenceParser = None
        self.sequenceParser = None
        self.writer = None
        # chromosome -> list of [start, end] intervals not covered so far
        self.presence = {}
        self.transcriptCount = 1
        self.split = False

    def createTranscript(self, chromosome, start, end):
        """Build a '+' strand transcript named 'region_N' for an interval.

        N is a counter incremented on every call; the same label is stored
        as name and as 'ID' tag.
        """
        transcript = Transcript()
        transcript.setChromosome(chromosome)
        transcript.setDirection("+")
        transcript.setStart(start)
        transcript.setEnd(end)
        transcript.setName("region_%d" % self.transcriptCount)
        transcript.setTagValue("ID", "region_%d" % self.transcriptCount)
        self.transcriptCount += 1
        return transcript

    def setSplit(self, split):
        """Store the flag later passed to setSplitDifference()."""
        self.split = split

    def setAnnotationFile(self, fileName, format):
        """Open the annotation (query) file; ignored when fileName is None."""
        if fileName != None:
            self.annotationParser = TranscriptContainer(fileName, format, self.verbosity)

    def setReferenceFile(self, fileName, format):
        """Open the reference file; ignored when fileName is None."""
        if fileName != None:
            self.referenceParser = TranscriptContainer(fileName, format, self.verbosity)

    def setSequenceFile(self, fileName):
        """Open the FASTA sequence file; ignored when fileName is None."""
        if fileName != None:
            self.sequenceParser = FastaParser(fileName, self.verbosity)

    def setOutputFile(self, fileName):
        """Create the GFF3 writer used for the results."""
        self.writer = Gff3Writer(fileName, self.verbosity)

    def initialize(self):
        """Start with one interval [1, size] per region of the sequence file.

        Raises:
            ValueError: when no sequence file has been set, since the
                regions cannot be enumerated without it.
        """
        if self.sequenceParser is None:
            raise ValueError("A sequence file is required when no reference file is given")
        self.presence = {}
        for chromosome in self.sequenceParser.getRegions():
            self.presence[chromosome] = [[1, self.sequenceParser.getSizeOfRegion(chromosome)]]

    def readTranscripts(self):
        """Cut every annotated interval out of the remaining intervals.

        For each transcript, every remaining interval of its chromosome that
        overlaps the transcript is replaced by the (possibly empty) pieces
        left on either side; those pieces are appended after the untouched
        intervals.  Transcripts located on a chromosome with no remaining
        interval entry (e.g. absent from the sequence file) have nothing to
        be cut from and are skipped instead of raising a KeyError.
        """
        nbTranscripts = self.annotationParser.getNbTranscripts()
        progress = Progress(nbTranscripts, "Parsing annotation file" , self.verbosity)
        for transcript in self.annotationParser.getIterator():
            regions = self.presence.get(transcript.getChromosome())
            if regions is not None:
                transcriptStart = transcript.getStart()
                transcriptEnd = transcript.getEnd()
                untouched = []
                leftovers = []
                for element in regions:
                    start, end = element
                    if start <= transcriptEnd and transcriptStart <= end:
                        # Overlap: keep only what sticks out on each side.
                        if start < transcriptStart:
                            leftovers.append([start, transcriptStart - 1])
                        if end > transcriptEnd:
                            leftovers.append([transcriptEnd + 1, end])
                    else:
                        untouched.append(element)
                # In-place update keeps the same list object in the dict.
                regions[:] = untouched + leftovers
            progress.inc()
        progress.done()

    def writeOutput(self):
        """Write every remaining interval as a transcript."""
        for chromosome in self.presence:
            for element in self.presence[chromosome]:
                start, end = element
                self.writer.addTranscript(self.createTranscript(chromosome, start, end))
        self.writer.write()

    def compareToSequence(self):
        """Sequence mode: whole regions minus the annotations."""
        self.initialize()
        self.readTranscripts()
        self.writeOutput()

    def compareToAnnotation(self):
        """Reference mode: delegate to TranscriptListsComparator."""
        transcriptListComparator = TranscriptListsComparator(None, self.verbosity)
        transcriptListComparator.setSplitDifference(self.split)
        transcriptListComparator.setInputTranscriptContainer(transcriptListComparator.QUERY, self.annotationParser)
        transcriptListComparator.setInputTranscriptContainer(transcriptListComparator.REFERENCE, self.referenceParser)
        transcriptListComparator.setOutputWriter(self.writer)
        transcriptListComparator.getDifferenceTranscriptList()

    def run(self):
        """Use reference mode when a reference file was set, else sequence mode."""
        if self.referenceParser != None:
            self.compareToAnnotation()
        else:
            self.compareToSequence()


if __name__ == "__main__":

    # parse command line
    description = "Get Difference v1.0.1: Get all the regions of the genome, except the one given or get all the elements from the first set which does not ovelap with the second set (at the nucleotide level). [Category: Data Comparison]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", default=None, type="string", help="reference file [format: file in transcript format given by -g]")
    parser.add_option("-g", "--format2", dest="format2", action="store", default=None, type="string", help="format of the reference file [format: transcript file format]")
    parser.add_option("-s", "--sequence", dest="sequenceFileName", action="store", default=None, type="string", help="sequence file [format: file in FASTA format]")
    # The '[default: ...]' tag was missing its closing bracket.
    parser.add_option("-p", "--split", dest="split", action="store_true", default=False, help="when comparing to a set of genomic coordinates, do not join [format: boolean] [default: False]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    getter = DifferenceGetter(options.verbosity)
    getter.setSplit(options.split)
    getter.setAnnotationFile(options.inputFileName1, options.format1)
    getter.setSequenceFile(options.sequenceFileName)
    getter.setReferenceFile(options.inputFileName2, options.format2)
    getter.setOutputFile(options.outputFileName)
    getter.run()
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class GetDistance(object):
    """Compute, and optionally plot, the distances between two transcript lists.

    The comparison itself is delegated to a TranscriptListsComparator
    (self.tlc); most setters simply forward their argument to it.  The
    distance distribution is then plotted with RPlotter when an output file
    name has been set.
    """

    def __init__(self, verbosity = 0):
        """Parameters:
            verbosity: trace level handed to every helper object.
        """
        self.verbosity = verbosity
        self.writer = None
        self.spearman = False
        self.tlc = TranscriptListsComparator(None, self.verbosity)
        # Keys of the distributions to plot: (0, ) when strands are merged,
        # (-1, 1) when they are plotted separately (see setStrands).
        self.strands = (0, )
        self.buckets = None
        self.title = ""
        self.xMin = None
        self.xMax = None
        self.proportion = False
        self.outputFileName = None
        self.keep = False
        # Filled in later; defined here so that every attribute exists.
        self.transcriptContainer1 = None
        self.transcriptContainer2 = None
        self.distances = None
        self.nbElements = 0
        self.plotter = None

    def __del__(self):
        pass

    def setQueryFile(self, fileName, format):
        """Open the query transcript file."""
        self.transcriptContainer1 = TranscriptContainer(fileName, format, self.verbosity)

    def setReferenceFile(self, fileName, format):
        """Open the reference transcript file."""
        self.transcriptContainer2 = TranscriptContainer(fileName, format, self.verbosity)

    def setOutputFile(self, fileName):
        """Set the plot output file name (None disables plotting)."""
        self.outputFileName = fileName

    def setOutputTranscriptFile(self, fileName):
        """Create the GFF3 writer handed to the comparator; ignored when None."""
        if fileName != None:
            self.writer = Gff3Writer(fileName, self.verbosity)

    def restrictQueryToStart(self, number):
        """Forward to the comparator's restrictToStart() for the query list."""
        self.tlc.restrictToStart(self.tlc.QUERY, number)

    def restrictReferenceToStart(self, number):
        """Forward to the comparator's restrictToStart() for the reference list."""
        self.tlc.restrictToStart(self.tlc.REFERENCE, number)

    def restrictQueryToEnd(self, number):
        """Forward to the comparator's restrictToEnd() for the query list."""
        self.tlc.restrictToEnd(self.tlc.QUERY, number)

    def restrictReferenceToEnd(self, number):
        """Forward to the comparator's restrictToEnd() for the reference list."""
        self.tlc.restrictToEnd(self.tlc.REFERENCE, number)

    def setAbsolute(self, boolean):
        """Forward to the comparator's setAbsolute()."""
        self.tlc.setAbsolute(boolean)

    def setProportion(self, boolean):
        """Plot percentages instead of raw counts when boolean is true."""
        self.proportion = boolean

    def setColinear(self, boolean):
        """Forward to the comparator's getColinearOnly()."""
        self.tlc.getColinearOnly(boolean)

    def setAntisense(self, boolean):
        """Forward to the comparator's getAntisenseOnly()."""
        self.tlc.getAntisenseOnly(boolean)

    def setDistances(self, minDistance, maxDistance):
        """Forward both bounds to the comparator."""
        self.tlc.setMinDistance(minDistance)
        self.tlc.setMaxDistance(maxDistance)

    def setStrands(self, boolean):
        """Forward to setStrandedDistance(); when true, plot strands -1 and 1."""
        self.tlc.setStrandedDistance(boolean)
        if boolean:
            self.strands = (-1, 1)

    def setUpstream(self, number):
        """Forward to the comparator's setUpstream() for the reference list."""
        self.tlc.setUpstream(self.tlc.REFERENCE, number)

    def setDownstream(self, number):
        """Forward to the comparator's setDownstream() for the reference list."""
        self.tlc.setDownstream(self.tlc.REFERENCE, number)

    def setBuckets(self, number):
        """Set the bucket size; a non-None value switches to a bar plot."""
        self.buckets = number

    def setTitle(self, title):
        """Set the plot title."""
        self.title = title

    def setXValues(self, xMin, xMax):
        """Set the x-axis bounds handed to the plotter."""
        self.xMin, self.xMax = xMin, xMax

    def keepTmpValues(self, boolean):
        """Store the flag passed as third argument to RPlotter."""
        self.keep = boolean

    def getSpearman(self, boolean):
        """Enable or disable the Spearman's rho report.

        The flag used to be forced to True whatever the argument was.
        """
        self.spearman = bool(boolean)

    def compare(self):
        """Run the comparison and store its result in self.distances."""
        self.tlc.setInputTranscriptContainer(self.tlc.QUERY, self.transcriptContainer1)
        self.tlc.setInputTranscriptContainer(self.tlc.REFERENCE, self.transcriptContainer2)
        self.tlc.setOutputWriter(self.writer)
        self.distances = self.tlc.compareTranscriptListDistance()

    def checkEmptyDistances(self):
        """Return True when no distance was recorded for any plotted strand."""
        return (sum([len(self.distances[strand].keys()) for strand in self.strands]) == 0)

    def setPlotterMinusStrand(self):
        """Negate the counts of strand -1 so that it is drawn below the axis."""
        if -1 in self.strands:
            # list(): iterate over a snapshot while the values are rewritten.
            for x, y in list(self.distances[-1].items()):
                self.distances[-1][x] = -y

    def setPlotterProportion(self):
        """Turn counts into percentages of the total when proportion is set."""
        if not self.proportion:
            return
        self.nbElements = sum([abs(sum(self.distances[strand].values())) for strand in self.strands])
        for strand in self.strands:
            self.distances[strand] = dict([(distance, float(nb) / self.nbElements * 100) for distance, nb in self.distances[strand].items()])

    def setPlotter(self):
        """Create and configure the RPlotter for the output file."""
        self.plotter = RPlotter(self.outputFileName, self.verbosity, self.keep)
        if self.buckets != None:
            self.plotter.setBarplot(True)
        self.plotter.setFill(0)
        self.plotter.setXLabel("distance")
        self.plotter.setYLabel("# elements")
        if self.proportion:
            self.plotter.setYLabel("%% elements (%d in toto)" % (self.nbElements))
        self.plotter.setBuckets(self.buckets)
        self.plotter.setMinimumX(self.xMin)
        self.plotter.setMaximumX(self.xMax)
        self.plotter.setTitle(self.title)

    def plot(self):
        """Post-process the distances and plot them when an output file is set.

        Exits the program after printing 'No output.' when there is nothing
        to plot.
        """
        if len(self.strands) == 1:
            # Single distribution: index it under the pseudo-strand 0 so that
            # the code below can treat both cases alike.
            self.distances = {0: self.distances}
        if self.checkEmptyDistances():
            print("No output.")
            sys.exit()
        self.setPlotterMinusStrand()
        self.setPlotterProportion()
        if self.outputFileName == None:
            return
        self.setPlotter()
        for strand in self.strands:
            self.plotter.addLine(self.distances[strand])
        self.plotter.plot()

    def printSpearman(self):
        """Print Spearman's rho when requested and a plot was produced.

        The value comes from the plotter, which only exists when an output
        file name was set; without it there is nothing to report.
        """
        if self.spearman and self.plotter is not None:
            print("Spearman's rho: %.5f" % (self.plotter.getSpearmanRho()))

    def run(self):
        """Compare the two lists, plot the result, report Spearman's rho."""
        self.compare()
        self.plot()
        self.printSpearman()
dest="colinear", action="store_true", default=False, help="only consider features on the same strand [format: bool] [default: false]") + parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="only consider features on the opposite strand [format: bool] [default: false]") + parser.add_option("-b", "--absolute", dest="absolute", action="store_true", default=False, help="give the absolute value of the distance [format: bool] [default: false]") + parser.add_option("-p", "--proportion", dest="proportion", action="store_true", default=False, help="give the proportion on the y-axis instead of the number of distances [format: bool] [default: false]") + parser.add_option("-s", "--start1", dest="start1", action="store", default=None, type="int", help="only consider the n first 5' nucleotides for list 1 [format: int]") + parser.add_option("-S", "--start2", dest="start2", action="store", default=None, type="int", help="only consider the n first 5' nucleotides for list 2 [format: int]") + parser.add_option("-e", "--end1", dest="end1", action="store", default=None, type="int", help="only consider the n last 3' nucleotides for list 1 [format: int]") + parser.add_option("-E", "--end2", dest="end2", action="store", default=None, type="int", help="only consider the n last 3' nucleotides for list 2 [format: int]") + parser.add_option("-m", "--minDistance", dest="minDistance", action="store", default=None, type="int", help="minimum distance considered between two transcripts [format: int] [default: None]") + parser.add_option("-M", "--maxDistance", dest="maxDistance", action="store", default=1000, type="int", help="maximum distance considered between two transcripts [format: int] [default: 1000]") + parser.add_option("-5", "--fivePrime", dest="fivePrime", action="store_true", default=False, help="consider the elements from list 1 which are upstream of elements of list 2 [format: bool] [default: False]") + parser.add_option("-3", 
"--threePrime", dest="threePrime", action="store_true", default=False, help="consider the elements from list 1 which are downstream of elements of list 2 [format: bool] [default: False]") + parser.add_option("-u", "--buckets", dest="buckets", action="store", default=None, type="int", help="plot histogram instead of line plot with given interval size [format: int] [default: None]") + parser.add_option("-2", "--2strands", dest="twoStrands", action="store_true", default=False, help="plot the distributions of each strand separately [format: bool] [default: False]") + parser.add_option("-r", "--spearman", dest="spearman", action="store_true", default=False, help="compute Spearman rho [format: bool] [default: False]") + parser.add_option("-x", "--xMin", dest="xMin", action="store", default=None, type="int", help="minimum value on the x-axis to plot [format: int] [default: None]") + parser.add_option("-X", "--xMax", dest="xMax", action="store", default=None, type="int", help="maximum value on the x-axis to plot [format: int] [default: None]") + parser.add_option("-t", "--title", dest="title", action="store", default=None, type="string", help="title for the graph [format: int] [default: None]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + parser.add_option("-k", "--keep", dest="keep", action="store_true", default=False, help="keep temporary files [format: bool]") + (options, args) = parser.parse_args() + + gd = GetDistance(options.verbosity) + gd.setQueryFile(options.inputFileName1, options.format1) + gd.setReferenceFile(options.inputFileName2, options.format2) + gd.setOutputFile(options.outputFileName) + gd.setOutputTranscriptFile(options.outputDistances) + gd.setColinear(options.colinear) + gd.setAntisense(options.antisense) + gd.setAbsolute(options.absolute) + gd.setProportion(options.proportion) + gd.restrictQueryToStart(options.start1) + gd.restrictReferenceToStart(options.start2) 
+ gd.restrictQueryToEnd(options.end1) + gd.restrictReferenceToEnd(options.end2) + gd.setDistances(options.minDistance, options.maxDistance) + gd.setUpstream(options.fivePrime) + gd.setDownstream(options.threePrime) + gd.setStrands(options.twoStrands) + gd.setBuckets(options.buckets) + gd.setTitle(options.title) + gd.setXValues(options.xMin, options.xMax) + gd.keepTmpValues(options.keep) + gd.run() diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/getDistribution.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/getDistribution.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,291 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. 
Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+"""Get the repartition of some elements in a chromosomes"""
+
+import os
+from optparse import OptionParser
+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
+from SMART.Java.Python.structure.Transcript import Transcript
+from commons.core.writer.Gff3Writer import Gff3Writer
+from SMART.Java.Python.misc.RPlotter import RPlotter
+from SMART.Java.Python.misc.Progress import Progress
+from math import *
+
+def divideKeyDict(dictionary, ratio):
+    """Return a new dict whose keys are the keys of `dictionary` divided by `ratio`.
+
+    The values are kept unchanged.  This script always passes a float ratio
+    (1.0, 1000.0 or 1000000.0) to rescale bin positions to the plotted unit.
+    """
+    return dict([(key / ratio, dictionary[key]) for key in dictionary])
+
+
+def setTranscript(chromosome, direction, start, end, name, value):
+    """Build a Transcript with the given coordinates and name.
+
+    `value` is stored in the "nbElements" tag of the returned transcript.
+    """
+    transcript = Transcript()
+    transcript.setChromosome(chromosome)
+    transcript.setDirection(direction)
+    transcript.setStart(start)
+    transcript.setEnd(end)
+    transcript.setName(name)
+    transcript.setTagValue("nbElements", value)
+    return transcript
+
+
+
+if __name__ == "__main__":
+
+    # densities are expressed per `magnifyingFactor` nucleotides
+    magnifyingFactor = 1000
+
+    # parse command line
+    description = "Get Distribution v1.0.1: Get the distribution of the genomic coordinates on a genome. [Category: Visualization]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
+    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")
+    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
+    parser.add_option("-r", "--reference", dest="referenceFileName", action="store", default=None, type="string", help="file containing the genome [compulsory] [format: file in FASTA format]")
+    parser.add_option("-n", "--nbBins", dest="nbBins", action="store", default=1000, type="int", help="number of bins [default: 1000] [format: int]")
+    parser.add_option("-2", "--bothStrands", dest="bothStrands", action="store_true", default=False, help="plot one curve per strand [format: bool] [default: false]")
+    parser.add_option("-w", "--raw", dest="raw", action="store_true", default=False, help="plot raw number of occurrences instead of density [format: bool] [default: false]")
+    parser.add_option("-x", "--csv", dest="csv", action="store_true", default=False, help="write a .csv file [format: bool]")
+    parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="plot only a chromosome [format: string]")
+    parser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="start from a given region [format: int]")
+    parser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="end from a given region [format: int]")
+    parser.add_option("-y", "--yMin", dest="yMin", action="store", default=None, type="int", help="minimum value on the y-axis to plot [format: int]")
+    parser.add_option("-Y", "--yMax", dest="yMax", action="store", default=None, type="int", help="maximum value on the y-axis to plot [format: int]")
+    parser.add_option("-g", "--gff", dest="gff", action="store_true", default=False, help="also write GFF3 file [format: bool] [default: false]")
+    parser.add_option("-H", "--height", dest="height", action="store", default=None, type="int", help="height of the graphics [format: int] [default: 300]")
+    parser.add_option("-W", "--width", dest="width", action="store", default=None, type="int", help="width of the graphics [format: int] [default: 1000]")
+    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]")
+    parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool]")
+    parser.add_option("-D", "--directory", dest="working_Dir", action="store", default=os.getcwd(), type="string", help="the directory to store the results [format: directory]")
+    (options, args) = parser.parse_args()
+
+    # sizes: chromosome name -> chromosome length (or end of the requested region)
+    sizes = {}
+    if options.referenceFileName != None:
+        # get the sizes of the chromosomes
+        referenceHandle = open(options.referenceFileName)
+        name = None
+        size = 0
+        maxSize = 0
+        # always release the reference file, even if reading it fails
+        try:
+            for line in referenceHandle:
+                line = line.strip()
+                if line == "":
+                    continue
+                if line[0] == ">":
+                    # a new header closes the previous sequence, if any
+                    if name != None:
+                        if options.verbosity > 10:
+                            print name
+                        sizes[name] = size
+                        maxSize = max(maxSize, size)
+                    size = 0
+                    name = line[1:]
+                else:
+                    size += len(line)
+        finally:
+            referenceHandle.close()
+        # without any header there is no chromosome to register (the size used to be stored under the key None)
+        if name == None:
+            raise Exception("No sequence header found in reference file '%s'" % (options.referenceFileName))
+        # the last sequence is not followed by a header: register it here
+        sizes[name] = size
+        maxSize = max(maxSize, size)
+        if options.verbosity > 1:
+            print "done"
+        start = 0
+        end = maxSize
+    else:
+        if options.chromosome == None or options.start == None or options.end == None:
+            raise Exception("Missing chromosome or start and end positions, or reference file")
+        maxSize = options.end
+        sizes[options.chromosome] = options.end
+        start = options.start
+        end = options.end
+
+
+    # bin width: the raw width (tmp1) truncated to its two leading digits
+    tmp1 = int(maxSize / float(options.nbBins))
+    tmp2 = 10 ** (len("%d" % (tmp1))-2)
+    # a region shorter than the number of bins used to give a width of 0, hence divisions by zero below
+    sliceSize = max(1, int((tmp1 / tmp2) * tmp2))
+
+    # per chromosome: first position of each bin -> number of elements starting in the bin
+    bins = dict()
+    binsPlus = dict()
+    binsMinus = dict()
+    for chromosome in sizes:
+        bins[chromosome] = dict([(i * sliceSize + 1, 0) for i in range(start / sliceSize, sizes[chromosome] / sliceSize + 1)])
+        binsPlus[chromosome] = dict([(i * sliceSize + 1, 0) for i in range(start / sliceSize, sizes[chromosome] / sliceSize + 1)])
+        binsMinus[chromosome] = dict([(i * sliceSize + 1, 0) for i in range(start / sliceSize, sizes[chromosome] / sliceSize + 1)])
+
+    parser = TranscriptContainer(options.inputFileName, options.format, options.verbosity)
+    progress = Progress(parser.getNbTranscripts(), "Parsing %s" % (options.inputFileName), options.verbosity)
+    maxSlice = 0
+    # count the number of reads
+    for transcript in parser.getIterator():
+        if options.chromosome == None or (transcript.getChromosome() == options.chromosome and transcript.getStart() >= start and transcript.getStart() <= end):
+            if transcript.getDirection() == 1:
+                binsPlus[transcript.getChromosome()][(transcript.getStart() / sliceSize) * sliceSize + 1] += 1
+            else:
+                binsMinus[transcript.getChromosome()][(transcript.getStart() / sliceSize) * sliceSize + 1] += 1
+            bins[transcript.getChromosome()][(transcript.getStart() / sliceSize) * sliceSize + 1] += 1
+            maxSlice = max(maxSlice, transcript.getStart() / sliceSize)
+        progress.inc()
+    progress.done()
+
+    # compute densities
+    densityPlus = dict()
+    for chromosome in bins:
+        densityPlus[chromosome] = dict([(bin, 0) for bin in binsPlus[chromosome]])
+        for bin in binsPlus[chromosome]:
+            densityPlus[chromosome][bin] = float(binsPlus[chromosome][bin]) / sliceSize * magnifyingFactor
+        # correct densities for first and last bins
+        if start % sliceSize != 0:
+            densityPlus[chromosome][(start / sliceSize) * sliceSize + 1] = float(binsPlus[chromosome][(start / sliceSize) * sliceSize + 1]) / (sliceSize - (start % sliceSize)) * magnifyingFactor
+        if sizes[chromosome] % sliceSize != 0:
+            densityPlus[chromosome][(sizes[chromosome] / sliceSize) * sliceSize + 1] = float(binsPlus[chromosome][(sizes[chromosome] / sliceSize) * sliceSize + 1]) / (sizes[chromosome] % sliceSize) * magnifyingFactor
+    densityMinus = dict()
+    for chromosome in binsMinus:
+        densityMinus[chromosome] = dict([(bin, 0) for bin in binsMinus[chromosome]])
+        for bin in binsMinus[chromosome]:
+            densityMinus[chromosome][bin] = float(binsMinus[chromosome][bin]) / sliceSize * magnifyingFactor
+        # correct densities for first and last bins
+        if start % sliceSize != 0:
+            densityMinus[chromosome][(start / sliceSize) * sliceSize + 1] = float(binsMinus[chromosome][(start / sliceSize) * sliceSize + 1]) / (sliceSize - (start % sliceSize)) * magnifyingFactor
+        if sizes[chromosome] % sliceSize != 0:
+            densityMinus[chromosome][(sizes[chromosome] / sliceSize) * sliceSize + 1] = float(binsMinus[chromosome][(sizes[chromosome] / sliceSize) * sliceSize + 1]) / (sizes[chromosome] % sliceSize) * magnifyingFactor
+    density = dict()
+    for chromosome in bins:
+        density[chromosome] = dict([(bin, 0) for bin in bins[chromosome]])
+        for bin in bins[chromosome]:
+            density[chromosome][bin] = densityPlus[chromosome][bin] + densityMinus[chromosome][bin]
+
+    # from here on, minus strand values are stored negated (the CSV and GFF3 outputs negate them back)
+    for chromosome in densityMinus:
+        for bin in densityMinus[chromosome]:
+            densityMinus[chromosome][bin] *= -1
+        for bin in binsMinus[chromosome]:
+            binsMinus[chromosome][bin] *= -1
+
+    # one PNG file per chromosome
+    for chromosome in density:
+        # choose the unit of the x-axis from the position of the last bin
+        maxX = max(bins[chromosome].keys())
+        if maxX <= 1000:
+            unit = "nt."
+            ratio = 1.0
+        elif maxX <= 1000000:
+            unit = "kb"
+            ratio = 1000.0
+        else:
+            unit = "Mb"
+            ratio = 1000000.0
+        outputFileName = "%s_%s" % (options.outputFileName, chromosome)
+        if options.start != None and options.end != None:
+            outputFileName += ":%d-%d" % (options.start, options.end)
+        outputFileName += ".png"
+        plotter = RPlotter(outputFileName, options.verbosity)
+        plotter.setXLabel("Position on %s (in %s)" % (chromosome.replace("_", " "), unit))
+        plotter.setYLabel("# reads")
+        if options.bothStrands:
+            plotter.setImageSize(1000, 300)
+        else:
+            plotter.setImageSize(1000, 200)
+        if options.height != None:
+            plotter.setHeight(options.height)
+        if options.width != None:
+            plotter.setWidth(options.width)
+        # the lower bound depends on --yMin only (it used to be guarded by --yMax)
+        if options.yMin != None:
+            plotter.setMinimumY(options.yMin)
+        if options.yMax != None:
+            plotter.setMaximumY(options.yMax)
+        if options.bothStrands :
+            if options.raw:
+                plotter.addLine(divideKeyDict(binsPlus[chromosome], ratio))
+            else:
+                plotter.addLine(divideKeyDict(densityPlus[chromosome], ratio))
+            if options.raw:
+                plotter.addLine(divideKeyDict(binsMinus[chromosome], ratio))
+            else:
+                plotter.addLine(divideKeyDict(densityMinus[chromosome], ratio))
+        else:
+            if options.raw:
+                plotter.addLine(divideKeyDict(bins[chromosome], ratio))
+            else:
+                plotter.addLine(divideKeyDict(density[chromosome], ratio))
+        plotter.plot()
+
+    if options.csv:
+        outputFileName = "%s" % (options.outputFileName)
+        if options.chromosome != None:
+            outputFileName += "_%s" % (options.chromosome)
+        if options.start != None and options.end != None:
+            outputFileName += ":%d-%d" % (options.start, options.end)
+        outputFileName += ".csv"
+        csvHandle = open(outputFileName, "w")
+        # NOTE(review): the header stops at the last bin holding an element (maxSlice) whereas
+        # each row below lists every bin of its chromosome -- confirm that this is intended
+        for slice in range(start / sliceSize, maxSlice + 1):
+            csvHandle.write(";%d-%d" % (slice * sliceSize + 1, (slice+1) * sliceSize))
+        csvHandle.write("\n")
+        if options.bothStrands:
+            for chromosome in densityPlus:
+                if len(densityPlus[chromosome]) > 0:
+                    csvHandle.write("%s [+]" % (chromosome))
+                    for slice in sorted(densityPlus[chromosome].keys()):
+                        csvHandle.write(";%.2f" % (densityPlus[chromosome][slice]))
+                    csvHandle.write("\n")
+                if len(densityMinus[chromosome]) > 0:
+                    csvHandle.write("%s [-]" % (chromosome))
+                    # iterate over the bins of the minus strand itself (the plus strand keys were used here)
+                    for slice in sorted(densityMinus[chromosome].keys()):
+                        csvHandle.write(";%.2f" % (-densityMinus[chromosome][slice]))
+                    csvHandle.write("\n")
+        else:
+            for chromosome in density:
+                if len(density[chromosome]) > 0:
+                    csvHandle.write(chromosome)
+                    for slice in sorted(density[chromosome].keys()):
+                        csvHandle.write(";%.2f" % (density[chromosome][slice]))
+                    csvHandle.write("\n")
+        csvHandle.close()
+
+    if options.gff:
+        chromosome = "" if options.chromosome == None else options.chromosome.capitalize()
+        start = "" if options.start == None else "%d" % (options.start)
+        end = "" if options.end == None else "%d" % (options.end)
+        link1 = "" if options.start == None and options.end == None else ":"
+        link2 = "" if options.start == None and options.end == None else "-"
+        writer = Gff3Writer("%s%s%s%s%s.gff3" % (options.outputFileName, link1, start, link2, end), options.verbosity)
+        cpt = 1
+        if options.raw:
+            valuesPlus = binsPlus
+            valuesMinus = binsMinus
+            values = bins
+        else:
+            valuesPlus = densityPlus
+            valuesMinus = densityMinus
+            values = density
+        if options.bothStrands:
+            for chromosome in values:
+                for slice in valuesPlus[chromosome]:
+                    writer.addTranscript(setTranscript(chromosome, 1, slice, slice + sliceSize, "region%d" % (cpt), valuesPlus[chromosome][slice]))
+                    cpt += 1
+                for slice in valuesMinus[chromosome]:
+                    writer.addTranscript(setTranscript(chromosome, -1, slice, slice + sliceSize, "region%d" % (cpt), - valuesMinus[chromosome][slice]))
+                    cpt += 1
+        else:
+            for chromosome in values:
+                for slice in values[chromosome]:
+                    writer.addTranscript(setTranscript(chromosome, 1, slice, slice + sliceSize, "region%d" % (cpt), values[chromosome][slice]))
+                    cpt += 1
+        writer.write()
+
+
diff -r 000000000000 -r e0f8dcca02ed
smart_toolShed/SMART/Java/Python/getElement.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/getElement.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,106 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+#
+"""Get the first element (exon / intron) from a list of transcripts"""
+
+import os
+# sys.exit() is called below; import it explicitly instead of relying on a star import to provide it
+import sys
+from optparse import OptionParser
+from commons.core.writer.Gff3Writer import *
+from SMART.Java.Python.structure.TranscriptContainer import *
+from SMART.Java.Python.misc.Progress import *
+
+
+if __name__ == "__main__":
+
+    # parse command line
+    description = "Get Element v1.0.1: Get the first element (exon / intron) from a list of transcripts. [Category: Personnal]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
+    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [compulsory] [format: transcript file format]")
+    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
+    parser.add_option("-y", "--mysql", dest="mysql", action="store_true", default=False, help="mySQL output [format: bool] [default: false]")
+    parser.add_option("-t", "--type", dest="type", action="store", type="string", help="type of the element [format: choice (exon, intron)]")
+    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
+    parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")
+    (options, args) = parser.parse_args()
+
+    parser = TranscriptContainer(options.inputFileName, options.format, options.verbosity)
+    writer = Gff3Writer(options.outputFileName, options.verbosity)
+    # NOTE(review): the MySQL writer is built even without --mysql -- confirm that its constructor has no side effect
+    sqlWriter = MySqlTranscriptWriter(options.outputFileName, options.verbosity)
+
+    nbLines = parser.getNbTranscripts()
+    print "%i lines found" % (nbLines)
+
+    # treat transcripts
+    nbWritten = 0
+    nbUsed = 0
+    progress = Progress(nbLines, "Analyzing transcripts of " + options.inputFileName, options.verbosity)
+    for transcript in parser.getIterator():
+
+        # template carrying the name, strand and chromosome of the input transcript
+        outTranscript = Transcript()
+        outTranscript.setName(transcript.getName())
+        outTranscript.setDirection(transcript.getDirection())
+        outTranscript.setChromosome(transcript.getChromosome())
+
+        if options.type == "exon":
+            # only transcripts with several exons are considered; the first exon after sorting is written
+            if len(transcript.getExons()) > 1:
+                transcript.sortExons()
+                outTranscript.setStart(transcript.getExons()[0].getStart())
+                outTranscript.setEnd(transcript.getExons()[0].getEnd())
+                writer.addTranscript(outTranscript)
+                # NOTE(review): the MySQL writer receives the whole input transcript, not the extracted element -- confirm
+                if options.mysql:
+                    sqlWriter.addTranscript(transcript)
+                nbWritten += 1
+                nbUsed += 1
+        elif options.type == "intron":
+            # every intron is written as a separate element
+            used = False
+            for intron in transcript.getIntrons():
+                used = True
+                thisTranscript = Transcript()
+                thisTranscript.copy(outTranscript)
+                thisTranscript.setStart(intron.getStart())
+                thisTranscript.setEnd(intron.getEnd())
+                writer.addTranscript(thisTranscript)
+                if options.mysql:
+                    sqlWriter.addTranscript(transcript)
+                nbWritten += 1
+            if used:
+                nbUsed += 1
+        else:
+            sys.exit("Cannot understan type %s" % (options.type))
+        progress.inc()
+    progress.done()
+
+    # flush the GFF3 writer, as getDistribution.py does with the same class after its last addTranscript()
+    writer.write()
+
+    if options.mysql:
+        sqlWriter.write()
+
+    print "nb sequences used: %d" % (nbUsed)
+    print "nb elements used: %d" % (nbWritten)
diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/getExons.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/smart_toolShed/SMART/Java/Python/getExons.py Thu Jan 17 10:52:14 2013 -0500
@@ -0,0 +1,128 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+#
+from optparse import OptionParser
+from commons.core.parsing.ParserChooser import ParserChooser
+from commons.core.writer.TranscriptWriter import TranscriptWriter
+from SMART.Java.Python.structure.Transcript import Transcript
+from SMART.Java.Python.misc.Progress import Progress
+
+# despite its name, this maps a positive 1-based position to a 0-based index;
+# zero and negative values (positions counted from the end) are returned unchanged
+zeroBaseToOneBaseConvertor = (lambda x: x - 1 if x > 0 else x)
+
+class GetExons(object):
+    """Write every exon (or a selection of the exons) of each input transcript as a separate transcript."""
+
+    def __init__(self, verbosity):
+        self.verbosity = verbosity
+        # True once setSelection() has received a selection
+        self.selection = False
+
+    def setInputFile(self, fileName, format):
+        """Create the parser of `fileName` for the given transcript format."""
+        chooser = ParserChooser(self.verbosity)
+        chooser.findFormat(format)
+        self.parser = chooser.getParser(fileName)
+
+    def setSelection(self, selection):
+        """Parse a selection such as "1,2,5..-3,-1".
+
+        Comma-separated parts are either a single position ("2") or an
+        interval ("5..-3").  Nothing is done when `selection` is None.
+        Raises Exception when a part is not made of one or two integers.
+        """
+        if selection == None:
+            return
+        self.selection = True
+        self.selectionItems = []
+        self.selectionIntervals = []
+        for part in selection.split(","):
+            try:
+                splittedPart = [int(element) for element in part.split("..")]
+            except ValueError:
+                # report the offending text: the parsed list does not exist when int() fails
+                raise Exception("Elements '" + part + "' of selection '" + selection + "' do no seem to be integers!")
+            if len(splittedPart) == 1:
+                self.selectionItems.append(splittedPart[0])
+            elif len(splittedPart) == 2:
+                self.selectionIntervals.append((splittedPart[0], splittedPart[1]))
+            else:
+                raise Exception("Cannot parse elements '" + part + "' of selection '" + selection + "'!")
+
+    def getSelectionExonIndices(self, nbExons):
+        """Return the 0-based indices of the selected exons of a transcript with `nbExons` exons.
+
+        Every index is returned when no selection was set.  Single positions
+        that the transcript does not have are skipped.
+        """
+        if not self.selection:
+            return range(nbExons)
+        allIndices = range(nbExons)
+        indices = []
+        for item in self.selectionItems:
+            index = zeroBaseToOneBaseConvertor(item)
+            # a transcript with too few exons must not abort the whole run with an IndexError
+            if -nbExons <= index < nbExons:
+                indices.append(allIndices[index])
+        for start, end in self.selectionIntervals:
+            # a positive, inclusive 1-based end is exactly the exclusive 0-based end of a slice,
+            # including for end == 1 (which used to select nothing)
+            # NOTE(review): a negative end is handed to the slice unchanged, so "5..-3" stops
+            # before the third exon from the end -- confirm whether an inclusive end was intended
+            indices.extend(allIndices[zeroBaseToOneBaseConvertor(start):end])
+        return indices
+
+    def setOutputFile(self, fileName):
+        """Create the GFF3 writer of the output file."""
+        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)
+
+    def run(self):
+        """Read the input and write each selected exon as a transcript of its own."""
+        progress = Progress(self.parser.getNbTranscripts(), "Reading input file", self.verbosity)
+        nbExons = 0
+        for cpt1, transcript in enumerate(self.parser.getIterator()):
+            # a set makes the membership test below independent of the number of selected exons
+            selectedExons = set(self.getSelectionExonIndices(transcript.getNbExons()))
+            transcript.sortExons()
+            for cpt2, exon in enumerate(transcript.getExons()):
+                if cpt2 not in selectedExons:
+                    continue
+                exonTranscript = Transcript()
+                exonTranscript.copy(exon)
+                # the exon is written as a stand-alone feature: drop its parent and rename it if unnamed
+                if "Parent" in exonTranscript.tags:
+                    del exonTranscript.tags["Parent"]
+                exonTranscript.tags["feature"] = "transcript"
+                if "ID" not in exonTranscript.tags or exonTranscript.tags["ID"] == "unnamed transcript":
+                    exonTranscript.tags["ID"] = "exon_%d-%d" % (cpt1+1, cpt2+1)
+                if exonTranscript.getName() == "unnamed transcript":
+                    exonTranscript.setName("exon_%d-%d" % (cpt1+1, cpt2+1))
+                self.writer.addTranscript(exonTranscript)
+                nbExons += 1
+            progress.inc()
+        self.writer.write()
+        self.writer.close()
+        progress.done()
+        if self.verbosity > 1:
+            print "%d transcripts read" % (self.parser.getNbTranscripts())
+            print "%d exons written" % (nbExons)
+
+if __name__ == "__main__":
+
+    description = "Get Exons v1.0.1: Get the exons of a set of transcripts. [Category: Data Modification]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
+    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [compulsory] [format: transcript file format]")
+    parser.add_option("-s", "--select", dest="select", action="store", default=None, type="string", help="select some of the exons (like '1,2,5..-3,-1') [format: string]")
+    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
+    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
+    (options, args) = parser.parse_args()
+
+    ge = GetExons(options.verbosity)
+    ge.setInputFile(options.inputFileName, options.format)
+    ge.setSelection(options.select)
+    ge.setOutputFile(options.outputFileName)
+    ge.run()
diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/getInfoPerCoverage.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/smart_toolShed/SMART/Java/Python/getInfoPerCoverage.py Thu Jan 17 10:52:14 2013 -0500
@@ -0,0 +1,167 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+"""Compare overlap of a transcript list and list of read, and get some info depending on the coverage"""
+
+import os
+from optparse import OptionParser
+from commons.core.parsing.SequenceListParser import *
+from commons.core.writer.Gff3Writer import *
+from SMART.Java.Python.mySql.MySqlConnection import *
+from SMART.Java.Python.structure.TranscriptListsComparator import *
+from SMART.Java.Python.misc.RPlotter import *
+from SMART.Java.Python.misc.Progress import *
+
+
+if __name__ == "__main__":
+
+    # parse command line
+    description = "Get Info per Coverage v1.0.1: Get a list of information clustered by the density of the coverage on a genome. [Category: Personnal]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
+    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]")
+    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
+    parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of file 2 [compulsory] [format: transcript file format]")
+    parser.add_option("-o", "--output", dest="output", action="store", default=None, type="string", help="output file [compulsory] [format: output file in TXT format]")
+    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
+    # NOTE(review): --log takes the name of the log file (type="string"), although its help text announces a bool
+    parser.add_option("-l", "--log", dest="log", action="store", default=None, type="string", help="write a log file [format: bool] [default: false]")
+    (options, args) = parser.parse_args()
+
+    # the log file is only opened when a name was given; logHandle stays None otherwise
+    logHandle = None
+    if options.log != None:
+        logHandle = open(options.log, "w")
+
+    transcriptContainer1 = TranscriptContainer(options.inputFileName1, options.format1, options.verbosity)
+    transcriptContainer2 = TranscriptContainer(options.inputFileName2, options.format2, options.verbosity)
+
+    # compare file 1 (query) with file 2 (reference); the reference is restricted with
+    # restrictToStart(..., 10) and only colinear matches are requested
+    transcriptListComparator = TranscriptListsComparator(logHandle, options.verbosity)
+    transcriptListComparator.restrictToStart(transcriptListComparator.REFERENCE, 10)
+    transcriptListComparator.getColinearOnly(True)
+    transcriptListComparator.computeOddsPerTranscript(True)
+    transcriptListComparator.setInputTranscriptContainer(transcriptListComparator.QUERY, transcriptContainer1)
+    transcriptListComparator.setInputTranscriptContainer(transcriptListComparator.REFERENCE, transcriptContainer2)
+    transcriptListComparator.compareTranscriptList()
+    # the returned tables are not used further down in this script
+    transcriptTables = transcriptListComparator.getOutputTables()
+
+    # transcript name -> (feature value, coverage); used for the scatter plots
+    sizesWithIntrons = {}
+    sizesWithoutIntrons = {}
+    nbExons = {}
+    # feature value -> sum of the coverages (turned into averages after the loop)
+    averageSizesWithIntrons = {}
+    averageSizesWithoutIntrons = {}
+    averageNbExons = {}
+    # feature value -> number of transcripts having this value
+    sumSizesWithIntrons = {}
+    sumSizesWithoutIntrons = {}
+    sumSizesNbExons = {}
+    # presumably maps each transcript name to the value computed by computeOddsPerTranscript -- confirm in TranscriptListsComparator
+    coverages = transcriptListComparator.getOddsPerTranscript()
+
+    progress = Progress(transcriptContainer2.getNbTranscripts(), "Reading transcript file again", options.verbosity)
+    for transcript in transcriptContainer2.getIterator():
+        # transcripts without a coverage entry are skipped
+        if transcript.name in coverages:
+            if transcript.getSizeWithIntrons() not in averageSizesWithIntrons:
+                averageSizesWithIntrons[transcript.getSizeWithIntrons()] = coverages[transcript.name]
+            else:
+                averageSizesWithIntrons[transcript.getSizeWithIntrons()] += coverages[transcript.name]
+            if transcript.getSizeWithIntrons() not in sumSizesWithIntrons:
+                sumSizesWithIntrons[transcript.getSizeWithIntrons()] = 1
+            else:
+                sumSizesWithIntrons[transcript.getSizeWithIntrons()] += 1
+            if transcript.getSize() not in averageSizesWithoutIntrons:
+                averageSizesWithoutIntrons[transcript.getSize()] = coverages[transcript.name]
+            else:
+                averageSizesWithoutIntrons[transcript.getSize()] += coverages[transcript.name]
+            if transcript.getSize() not in sumSizesWithoutIntrons:
+                sumSizesWithoutIntrons[transcript.getSize()] = 1
+            else:
+                sumSizesWithoutIntrons[transcript.getSize()] += 1
+            if transcript.getNbExons() not in averageNbExons:
+                averageNbExons[transcript.getNbExons()] = coverages[transcript.name]
+            else:
+                averageNbExons[transcript.getNbExons()] += coverages[transcript.name]
+            if transcript.getNbExons() not in sumSizesNbExons:
+                sumSizesNbExons[transcript.getNbExons()] = 1
+            else:
+                sumSizesNbExons[transcript.getNbExons()] += 1
+            sizesWithIntrons[transcript.name] = (transcript.getSizeWithIntrons(), coverages[transcript.name])
+            sizesWithoutIntrons[transcript.name] = (transcript.getSize(), coverages[transcript.name])
+            nbExons[transcript.name] = (transcript.getNbExons(), coverages[transcript.name])
+        progress.inc()
+    progress.done()
+
+    # scatter plots: one point per transcript; every output file name is prefixed with options.output
+    plotterSizeWithIntrons = RPlotter("%sWithIntrons.png" % (options.output), options.verbosity)
+    plotterSizeWithIntrons.setPoints(True)
+    plotterSizeWithIntrons.setMaximumX(10000)
+    plotterSizeWithIntrons.setMaximumY(1000)
+    plotterSizeWithIntrons.setLog("y")
+    plotterSizeWithIntrons.addLine(sizesWithIntrons)
+    plotterSizeWithIntrons.plot()
+
+    plotterSizeWithoutIntrons = RPlotter("%sWithoutIntrons.png" % (options.output), options.verbosity)
+    plotterSizeWithoutIntrons.setPoints(True)
+    plotterSizeWithoutIntrons.setMaximumX(10000)
+    plotterSizeWithoutIntrons.setMaximumY(1000)
+    plotterSizeWithoutIntrons.setLog("y")
+    plotterSizeWithoutIntrons.addLine(sizesWithoutIntrons)
+    plotterSizeWithoutIntrons.plot()
+
+    plotterNbExons = RPlotter("%sNbExons.png" % (options.output), options.verbosity)
+    plotterNbExons.setPoints(True)
+    plotterNbExons.addLine(nbExons)
+    plotterNbExons.plot()
+
+    # turn each sum of coverages into an average, truncated to an integer
+    for element in averageSizesWithIntrons:
+        averageSizesWithIntrons[element] = int(float(averageSizesWithIntrons[element]) / sumSizesWithIntrons[element])
+    plotterAverageSizeWithIntrons = RPlotter("%sAverageWithIntrons.png" % (options.output), options.verbosity)
+    plotterAverageSizeWithIntrons.setMaximumX(10000)
+    plotterAverageSizeWithIntrons.setMaximumY(1000)
+    plotterAverageSizeWithIntrons.setLog("y")
+    plotterAverageSizeWithIntrons.addLine(averageSizesWithIntrons)
+    plotterAverageSizeWithIntrons.plot()
+    # NOTE(review): Utils is not imported by name in this file; presumably one of the star imports provides it -- confirm
+    print "min/avg/med/max sizes with introns: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(averageSizesWithIntrons)
+
+    for element in averageSizesWithoutIntrons:
+        averageSizesWithoutIntrons[element] = int(float(averageSizesWithoutIntrons[element]) / sumSizesWithoutIntrons[element])
+    plotterAverageSizeWithoutIntrons = RPlotter("%sAverageWithoutIntrons.png" % (options.output), options.verbosity)
+    plotterAverageSizeWithoutIntrons.setMaximumX(10000)
+    plotterAverageSizeWithoutIntrons.setMaximumY(1000)
+    plotterAverageSizeWithoutIntrons.setLog("y")
+    plotterAverageSizeWithoutIntrons.addLine(averageSizesWithoutIntrons)
+    plotterAverageSizeWithoutIntrons.plot()
+    print "min/avg/med/max sizes without introns: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(averageSizesWithoutIntrons)
+
+    for element in averageNbExons:
+        averageNbExons[element] = int(float(averageNbExons[element]) / sumSizesNbExons[element])
+    plotterAverageNbExons = RPlotter("%sAverageNbExons.png" % (options.output), options.verbosity)
+    plotterAverageNbExons.addLine(averageNbExons)
+    plotterAverageNbExons.plot()
+    print "min/avg/med/max # exons: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(averageNbExons)
+
+    # the handle only exists when --log was given
+    if options.log:
+        logHandle.close()
diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/getIntrons.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/smart_toolShed/SMART/Java/Python/getIntrons.py Thu Jan 17 10:52:14 2013 -0500
@@ -0,0 +1,89 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
class GetIntrons(object):
    """Extract every intron of every transcript of an input file.

    Each intron is written to the output file as a stand-alone GFF3
    transcript.  Call setInputFile() and setOutputFile() before run().
    """

    def __init__(self, verbosity):
        self.verbosity = verbosity

    def setInputFile(self, fileName, format):
        """Bind the parser that matches `format` to the file `fileName`."""
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        self.parser = chooser.getParser(fileName)

    def setOutputFile(self, fileName):
        """Create the GFF3 writer that will receive the introns."""
        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)

    def run(self):
        """Read all the transcripts and write their introns.

        The writer is closed even when reading or writing fails, so that the
        output handle is never leaked; the original exception is propagated.
        """
        progress = Progress(self.parser.getNbTranscripts(), "Reading input file", self.verbosity)
        nbIntrons = 0
        try:
            for cpt1, transcript in enumerate(self.parser.getIterator()):
                for cpt2, intron in enumerate(transcript.getIntrons()):
                    # default name, used only when the intron carries none
                    defaultName = "intron_%d-%d" % (cpt1+1, cpt2+1)
                    intronTranscript = Transcript()
                    intronTranscript.copy(intron)
                    # the intron becomes a top-level feature: drop the link to its parent
                    if "Parent" in intronTranscript.tags:
                        del intronTranscript.tags["Parent"]
                    intronTranscript.tags["feature"] = "transcript"
                    if "ID" not in intronTranscript.tags or intronTranscript.tags["ID"] == "unnamed transcript":
                        intronTranscript.tags["ID"] = defaultName
                    if intronTranscript.getName() == "unnamed transcript":
                        intronTranscript.setName(defaultName)
                    self.writer.addTranscript(intronTranscript)
                    nbIntrons += 1
                progress.inc()
            self.writer.write()
        finally:
            self.writer.close()
        progress.done()
        if self.verbosity > 1:
            # single-argument print(...) behaves identically on Python 2 and 3
            print("%d transcripts read" % (self.parser.getNbTranscripts()))
            print("%d introns written" % (nbIntrons))


if __name__ == "__main__":

    description = "Get Introns v1.0.1: Get the introns of a set of transcripts. [Category: Data Modification]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # fail early, with the usage message, instead of passing None to the parser chooser
    if options.inputFileName is None or options.format is None:
        parser.error("options -i (input file) and -f (format) are compulsory")

    gi = GetIntrons(options.verbosity)
    gi.setInputFile(options.inputFileName, options.format)
    gi.setOutputFile(options.outputFileName)
    gi.run()
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
def writeCVSfile(outHandler, rates = None, positionCounts = None):
    """Write the per-position letter percentages as a semicolon-separated table.

    One row is written per position: the 1-based position, then one
    "<letter>=<percentage>%;" field per letter ("<letter>=0%;" when the letter
    never occurs at that position).

    outHandler     -- object with a write() method
    rates          -- {letter: {1-based position: percentage}}; defaults to
                      the module-level `positionRate` filled by the script
    positionCounts -- dict keyed by every 1-based position observed;
                      defaults to the module-level `nbPositions`

    The tables are keyed by 1-based positions, so the look-up uses the
    1-based position too (looking up the 0-based index shifted every column
    by one and always reported 0% for the first position).  Rows cover every
    observed position instead of the length of the last sequence read.
    """
    if rates is None:
        rates = positionRate
    if positionCounts is None:
        positionCounts = nbPositions
    # sorted, so that the column order is reproducible from run to run
    sortedLetters = sorted(rates)
    for position in sorted(positionCounts):
        outHandler.write("%s;" % (position))
        for letter in sortedLetters:
            if position in rates[letter]:
                outHandler.write("%s=%.2f%s;" % (letter, rates[letter][position], "%"))
            else:
                outHandler.write("%s=0%s;" % (letter, "%"))
        outHandler.write("\n")


if __name__ == "__main__":

    # parse command line
    description = "Get Letter Distribution v1.0.1: Compute the distribution of nucleotides of a set of genomic coordinates. [Category: Visualization]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file to be analyzed [compulsory] [format: file in sequence format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [format: sequence file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    parser.add_option("-c", "--csv", dest="csv", action="store_true", default=False, help="write a .csv file [format: bool] [default: false]")
    parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")
    (options, args) = parser.parse_args()

    chooser = ParserChooser()
    chooser.findFormat(options.format)
    # kept under a distinct name so that the option parser is not shadowed
    sequenceParser = chooser.getParser(options.inputFileName)
    nbSequences = sequenceParser.getNbSequences()
    print("%i sequences read" % (nbSequences))

    # treat items
    progress = Progress(nbSequences, "Analyzing sequences of " + options.inputFileName, options.verbosity)
    nbLettersTotal = 0
    nbLetters = {}         # letter -> # occurrences over all the sequences
    lettersRate = {}       # letter -> {percentage in a sequence -> # sequences}
    nbPositions = {}       # 1-based position -> # sequences reaching this position
    positionCount = {}     # letter -> {1-based position -> # occurrences}
    positionRate = {}      # letter -> {1-based position -> percentage}
    nbPositionRate = {}    # 1-based position -> % of sequences reaching this position
    for sequence in sequenceParser.getIterator():
        letters = sequence.getSequence()
        thisNbLettersTotal = sequence.getSize()
        nbLettersTotal += thisNbLettersTotal
        thisNbLetters = {}

        for pos in range(len(letters)):
            letter = letters[pos]
            thisNbLetters[letter] = thisNbLetters.get(letter, 0) + 1
            nbPositions[pos+1] = nbPositions.get(pos+1, 0) + 1
            if letter not in positionCount:
                positionCount[letter] = {}
            positionCount[letter][pos+1] = positionCount[letter].get(pos+1, 0) + 1

        for letter in thisNbLetters:
            nbLetters[letter] = nbLetters.get(letter, 0) + thisNbLetters[letter]
            if letter not in lettersRate:
                lettersRate[letter] = {}
            rate = int(float(thisNbLetters[letter]) / thisNbLettersTotal * 100)
            lettersRate[letter][rate] = lettersRate[letter].get(rate, 0) + 1
        progress.inc()
    progress.done()

    for letter in positionCount:
        positionRate[letter] = {}
        for pos in positionCount[letter]:
            positionRate[letter][pos] = positionCount[letter][pos] / float(nbPositions[pos]) * 100
    for pos in nbPositions:
        nbPositionRate[pos] = nbPositions[pos] / float(nbPositions[1]) * 100

    # plot content distributions
    plotter = RPlotter("%s.png" % (options.outputFileName), options.verbosity, True)
    plotter.setFill(0)
    plotter.setLegend(True)
    for letter in lettersRate:
        plotter.addLine(lettersRate[letter], letter)
    plotter.plot()

    # plot distribution per position
    plotter = RPlotter("%sPerNt.png" % (options.outputFileName), options.verbosity, True)
    plotter.setFill(0)
    plotter.setLegend(True)
    plotter.setXLabel("Position on the read")
    plotter.setYLabel("Percentage")
    for letter in positionRate:
        plotter.addLine(positionRate[letter], letter)
    plotter.addLine(nbPositionRate, "#")
    plotter.plot()

    if options.csv:
        outHandler = open("%s.csv" % (options.outputFileName), "w")
        try:
            writeCVSfile(outHandler, positionRate, nbPositions)
        finally:
            outHandler.close()

    print("%d sequences" % (nbSequences))
    print("%d letters" % (nbLettersTotal))
    for letter in nbLetters:
        print("%s: %d (%.2f%%)" % (letter, nbLetters[letter], float(nbLetters[letter]) / nbLettersTotal * 100))
if __name__ == "__main__":

    # command line
    description = "Get Nb v1.0.1: Get the distribution of exons per transcripts, or mapping per read, or transcript per cluster. [Category: Visualization]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in png format]")
    parser.add_option("-q", "--query", dest="query", action="store", type="string", help="query [compulsory] (# exons, # transcripts) [format: choice (exon, transcript, cluster)]")
    parser.add_option("-b", "--barplot", dest="barplot", action="store_true", default=False, help="use barplot representation [format: bool] [default: false]")
    parser.add_option("-x", "--xMax", dest="xMax", action="store", default=None, type="int", help="maximum value on the x-axis to plot [format: int]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]")
    parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")
    (options, args) = parser.parse_args()

    if options.query not in ("exon", "transcript", "cluster"):
        raise Exception("Do not understand query %s" % (options.query))

    container = TranscriptContainer(options.inputFileName, options.format, options.verbosity)

    # Tally one key per element.  The key is the number of exons ("exon"),
    # the element name ("transcript") or the "nbElements" tag, 1 when the
    # tag is absent ("cluster").
    counts = {}
    progress = Progress(container.getNbTranscripts(), "Parsing %s" % (options.inputFileName), options.verbosity)
    for element in container.getIterator():
        if options.query == "exon":
            key = element.getNbExons()
        elif options.query == "transcript":
            key = element.getName()
        else:
            key = element.getTagValue("nbElements") if "nbElements" in element.getTagNames() else 1
        counts[key] = counts.get(key, 0) + 1
        progress.inc()
    progress.done()

    if options.query == "transcript":
        # names are not plotted: turn "name -> # occurrences" into
        # "# occurrences -> # names"
        distribution = {}
        for nbOccurrences in counts.values():
            distribution[nbOccurrences] = distribution.get(nbOccurrences, 0) + 1
    else:
        distribution = counts

    plotter = RPlotter(options.outputFileName, options.verbosity)
    plotter.setImageSize(1000, 300)
    plotter.setFill(0)
    plotter.setMaximumX(options.xMax)
    plotter.setBarplot(options.barplot)
    plotter.addLine(distribution)
    plotter.plot()

    print("min/avg/med/max: %d/%.2f/%.1f/%d" % (Utils.getMinAvgMedMax(distribution)))
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
# number of uniform draws averaged by RandomRegionsGenerator.getDev()
repetitions = 100


class RandomRegionsGenerator(object):
    """Draw random regions on a genome and write them to a GFF3 file.

    The genome is described either by a FASTA file (setInput) or by a single
    chromosome name and size (setChromosomeName / setGenomeSize).  Regions
    are either created from scratch (setSize or setMinSize/setMaxSize,
    setNumber, setStrands) or obtained by moving the transcripts of an
    annotation file to random places (setAnnotation).
    """

    def __init__(self, verbosity):
        self.verbosity = verbosity
        self.strands = False
        self.distribution = "uniform"
        self.transcripts = None
        self.sequenceParser = None
        # Neutral defaults, so that the "gaussian" mode also works when only
        # one of setMaxDistribution / setDeviationDistribution is used:
        # one element per cluster, no jitter around the cluster centre.
        self.maxElements = 1
        self.deviation = 0
        random.seed()


    def setInput(self, fileName):
        self.sequenceParser = FastaParser(fileName, self.verbosity)


    def setGenomeSize(self, size):
        self.genomeSize = size


    def setChromosomeName(self, name):
        self.chromosomeName = name


    def setAnnotation(self, fileName, format):
        """Load the transcripts to shuffle; one region is written per transcript."""
        parser = TranscriptContainer(fileName, format, self.verbosity)
        self.transcripts = []
        for transcript in parser.getIterator():
            self.transcripts.append(transcript)
        self.setNumber(len(self.transcripts))
        self.setSize(0)


    def setOutputFile(self, fileName):
        self.outputFileName = fileName


    def setSize(self, size):
        self.minSize = size
        self.maxSize = size


    def setMinSize(self, size):
        self.minSize = size


    def setMaxSize(self, size):
        self.maxSize = size


    def setNumber(self, number):
        self.number = number


    def setStrands(self, strands):
        self.strands = strands


    def setMaxDistribution(self, maxElements):
        """Set the maximum # of elements per cluster and switch to clustered ("gaussian") mode."""
        if maxElements is None:
            return
        self.maxElements = maxElements
        self.distribution = "gaussian"


    def setDeviationDistribution(self, deviation):
        """Set the jitter around the cluster centre and switch to clustered ("gaussian") mode."""
        if deviation is None:
            return
        self.deviation = deviation
        self.distribution = "gaussian"


    def getSizes(self):
        """Fill self.chromosomes, self.sizes, self.cumulatedSize and self.cumulatedSizes."""
        if self.sequenceParser is None:
            self.chromosomes = [self.chromosomeName]
            self.sizes = {self.chromosomeName: self.genomeSize}
            self.cumulatedSize = self.genomeSize
            self.cumulatedSizes = {self.chromosomeName: self.genomeSize}
            return
        self.chromosomes = self.sequenceParser.getRegions()
        self.sizes = {}
        self.cumulatedSize = 0
        self.cumulatedSizes = {}
        for chromosome in self.chromosomes:
            self.sizes[chromosome] = self.sequenceParser.getSizeOfRegion(chromosome)
            self.cumulatedSize += self.sizes[chromosome]
            self.cumulatedSizes[chromosome] = self.cumulatedSize


    def findPosition(self, size = None):
        """Draw a random (chromosome, start, size) triple.

        The chromosome is drawn proportionally to its size, and the draw is
        repeated until the chromosome can hold the region; the start is then
        uniform in [1, chromosome size - size].  When `size` is None, it is
        drawn uniformly in [self.minSize, self.maxSize].

        Raises ValueError when no chromosome is larger than `size`.
        """
        if size is None:
            size = random.randint(self.minSize, self.maxSize)
        if not any(chromosomeSize > size for chromosomeSize in self.sizes.values()):
            raise ValueError("No chromosome is large enough to hold a region of size %d" % (size))
        while True:
            # half-open draw: always strictly below the last cumulated size,
            # so the search below always stops on a matching chromosome
            integer = random.randrange(self.cumulatedSize)
            for chromosome in self.chromosomes:
                if self.cumulatedSizes[chromosome] > integer:
                    break
            if self.sizes[chromosome] > size:
                break
        start = random.randint(1, self.sizes[chromosome] - size)
        return (chromosome, start, size)


    def createTranscript(self, chromosome, start, size, strand, cpt):
        """Build the transcript "rand_<cpt>" covering [start, start + size - 1]."""
        transcript = Transcript()
        transcript.setChromosome(chromosome)
        transcript.setStart(start)
        transcript.setEnd(start + size-1)
        transcript.setDirection(strand)
        transcript.setName("rand_%d" % (cpt))
        return transcript


    def moveTranscript(self, chromosome, start, transcript):
        """Move `transcript` (and its exons) so that it starts at `start` on `chromosome`.

        When the transcript would run past the end of the chromosome, a new
        position is drawn.  Returns a one-element list holding the moved
        transcript.
        """
        oldStart, oldEnd = transcript.getStart(), transcript.getEnd()
        span = oldEnd - oldStart
        # compare with the size of this chromosome (not with the cumulated
        # size, which is larger for every chromosome but the first one)
        while start + span > self.sizes[chromosome]:
            chromosome, start, size = self.findPosition(span)
        offset = start - oldStart
        transcript.setChromosome(chromosome)
        if transcript.getNbExons() > 1:
            # iterate over the exons themselves, not over their count
            for exon in transcript.getExons():
                oldExonStart, oldExonEnd = exon.getStart(), exon.getEnd()
                exon.setStart(oldExonStart + offset)
                exon.setEnd(oldExonEnd + offset)
        transcript.setStart(start)
        transcript.setEnd(oldEnd + offset)
        return [transcript]


    def createUniformCluster(self, chromosome, start, size, strand, cpt):
        transcript = self.createTranscript(chromosome, start, size, strand, cpt)
        return [transcript]


    def findNbTranscripts(self, cpt):
        """Draw the # of elements of the next cluster, capped by the # of regions still to write."""
        return min(int(round(math.exp(random.random() * math.log(self.maxElements)))), self.number - cpt + 1)


    def getDev(self):
        """Return the rounded mean of `repetitions` uniform draws in [-deviation, deviation]."""
        deviation = 0.0
        for j in range(repetitions):
            deviation += random.randint(-self.deviation, self.deviation)
        deviation /= repetitions
        deviation = int(round(deviation))
        return deviation


    def createGaussianCluster(self, chromosome, start, size, strand, cpt):
        """Create a cluster of jittered copies of the region [start, start + size - 1]."""
        transcripts = []
        nbTranscripts = self.findNbTranscripts(cpt)
        for i in range(nbTranscripts):
            transcript = self.createTranscript(chromosome, start + self.getDev(), size + self.getDev(), strand, cpt + i)
            transcripts.append(transcript)
        return transcripts


    def writeRegions(self):
        """Draw self.number regions and write them to self.outputFileName."""
        # the writer is the only handle opened on the output file: a second
        # open(..., "w") would only truncate the file the writer is using
        writer = Gff3Writer(self.outputFileName, self.verbosity)
        progress = Progress(self.number, "Writing to %s" % (self.outputFileName), self.verbosity)
        i = 0
        while i < self.number:
            chromosome, start, size = self.findPosition()
            strand = random.choice([-1, 1]) if self.strands else 1
            if self.transcripts is not None:
                transcripts = self.moveTranscript(chromosome, start, self.transcripts[i])
            elif self.distribution == "uniform":
                transcripts = self.createUniformCluster(chromosome, start, size, strand, i+1)
            else:
                transcripts = self.createGaussianCluster(chromosome, start, size, strand, i+1)
            # a cluster may hold several regions: each of them counts
            for transcript in transcripts:
                writer.addTranscript(transcript)
                i += 1
                progress.inc()
        progress.done()
        writer.write()
        writer.close()


    def run(self):
        self.getSizes()
        self.writeRegions()


if __name__ == "__main__":

    # parse command line
    description = "Get Random Regions v1.0.2: Get some random coordinates on a genome. May use uniform or gaussian distribution (in gaussion distribution, # of element per cluster follows a power law). [Category: Other]"

    parser = OptionParser(description = description)
    parser.add_option("-r", "--reference", dest="reference", action="store", default=None, type="string", help="file that contains the sequences [format: file in FASTA format]")
    parser.add_option("-S", "--referenceSize", dest="referenceSize", action="store", default=None, type="int", help="size of the chromosome (when no reference is given) [format: int]")
    parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="name of the chromosome (when no reference is given) [format: string]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in FASTA format]")
    parser.add_option("-i", "--input", dest="inputFileName", action="store", default=None, type="string", help="optional file containing regions to shuffle [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the previous file [format: transcript file format]")
    parser.add_option("-s", "--size", dest="size", action="store", default=None, type="int", help="size of the regions (if no region set is provided) [format: int]")
    parser.add_option("-z", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size of the regions (if no region set nor a fixed size are provided) [format: int]")
    parser.add_option("-Z", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size of the regions (if no region set nor a fixed size are provided) [format: int]")
    parser.add_option("-n", "--number", dest="number", action="store", default=None, type="int", help="number of regions (if no region set is provided) [format: int]")
    parser.add_option("-t", "--strands", dest="strands", action="store_true", default=False, help="use both strands (if no region set is provided) [format: boolean]")
    parser.add_option("-m", "--max", dest="max", action="store", default=None, type="int", help="max. # reads in a cluster (for Gaussian dist.) [format: int]")
    parser.add_option("-d", "--deviation", dest="deviation", action="store", default=None, type="int", help="deviation around the center of the cluster (for Gaussian dist.) [format: int]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    rrg = RandomRegionsGenerator(options.verbosity)
    if options.reference is None:
        rrg.setGenomeSize(options.referenceSize)
        rrg.setChromosomeName(options.chromosome)
    else:
        rrg.setInput(options.reference)
    rrg.setOutputFile(options.outputFileName)
    if options.inputFileName is None:
        if options.size is not None:
            rrg.setSize(options.size)
        else:
            rrg.setMinSize(options.minSize)
            rrg.setMaxSize(options.maxSize)
        rrg.setNumber(options.number)
        rrg.setStrands(options.strands)
    else:
        rrg.setAnnotation(options.inputFileName, options.format)
    rrg.setMaxDistribution(options.max)
    rrg.setDeviationDistribution(options.deviation)
    rrg.run()
def resolveOutputPrefix(workingDir, outputFileName):
    """
    Build the path prefix shared by the two output files (<prefix>.txt and <prefix>.png).

    The previous code only prepended the directory when it did NOT end with '/', so
    "-D /some/dir/" silently wrote into the current directory, and an empty directory
    raised an IndexError.  os.path.join handles both cases (and leaves an absolute
    output name untouched).

    @param workingDir:     directory where the results are stored
    @param outputFileName: output name given with -o (without extension)
    @return:               the joined path
    """
    return os.path.join(workingDir, outputFileName)


def computeThreshold(sortedCounts, number = None, percent = None):
    """
    Compute the minimal number of occurrences a read must have to be reported.

    The counts are sorted in increasing order, so the best ones are at the END of the
    list: keeping the best k means using the k-th count from the end as threshold.
    Reads tied with the threshold are all kept, so slightly more than k may be reported.

    @param sortedCounts: occurrences of each distinct read, sorted in increasing order
    @param number:       keep the best n distinct reads (ignored if percent is given)
    @param percent:      keep the best n% distinct reads
    @return:             the threshold; 0 (keep everything) when no filter is requested,
                         when the requested amount is not positive, or when there is no read
    """
    nbCounts = len(sortedCounts)
    if nbCounts == 0:
        return 0
    if percent is not None:
        if percent <= 0:
            return 0
        # always keep at least the most represented read
        nbKept = max(1, int(float(percent) / 100 * nbCounts))
    elif number is not None:
        if number <= 0:
            return 0
        nbKept = number
    else:
        return 0
    # asking for more than what is available simply keeps everything
    return sortedCounts[-min(nbKept, nbCounts)]


if __name__ == "__main__":

    # parse command line
    description = "Get Read Distribution v1.0.1: Plot the number of identical reads and give the most represented. [Category: Visualization]"

    optionParser = OptionParser(description = description)
    optionParser.add_option("-i", "--input",     dest="inputFileName",  action="store",                      type="string", help="input file sequence [compulsory] [format: file in sequence format given by -f]")
    optionParser.add_option("-f", "--format",    dest="format",         action="store",                      type="string", help="format of the file [compulsory] [format: sequence file format]")
    optionParser.add_option("-n", "--number",    dest="number",         action="store", default=None,        type="int",    help="keep the best n [format: int]")
    optionParser.add_option("-p", "--percent",   dest="percent",        action="store", default=None,        type="float",  help="keep the best n\\% [format: float]")
    optionParser.add_option("-o", "--output",    dest="outputFileName", action="store",                      type="string", help="output file [compulsory] [format: output files in PNG format and txt format]")
    optionParser.add_option("-x", "--xMax",      dest="xMax",           action="store", default=None,        type="int",    help="maximum value on the x-axis to plot [format: int]")
    optionParser.add_option("-D", "--directory", dest="working_Dir",    action="store", default=os.getcwd(), type="string", help="the directory to store the results [format: directory]")
    optionParser.add_option("-v", "--verbosity", dest="verbosity",      action="store", default=1,           type="int",    help="trace level [format: int]")
    (options, args) = optionParser.parse_args()

    # fail early, with the usage message, instead of crashing on a None later on
    if options.inputFileName is None or options.outputFileName is None:
        optionParser.error("options -i and -o are compulsory")

    options.outputFileName = resolveOutputPrefix(options.working_Dir, options.outputFileName)

    # the sequence parser gets its own name: it is unrelated to the option parser
    if options.format == "fasta":
        sequenceParser = FastaParser(options.inputFileName, options.verbosity)
    elif options.format == "fastq":
        sequenceParser = FastqParser(options.inputFileName, options.verbosity)
    else:
        raise Exception("Do not understand '%s' file format." % (options.format))

    # count the occurrences of each distinct read
    progress  = Progress(sequenceParser.getNbSequences(), "Reading %s" % (options.inputFileName), options.verbosity)
    sequences = {}
    for sequence in sequenceParser.getIterator():
        read            = sequence.sequence
        sequences[read] = sequences.get(read, 0) + 1
        progress.inc()
    progress.done()

    values    = sorted(sequences.values())
    threshold = computeThreshold(values, options.number, options.percent)

    # sort by value: number of occurrences -> reads seen that many times
    progress     = Progress(len(sequences), "Sorting values", options.verbosity)
    sortedValues = {}
    for read in sequences:
        sortedValues.setdefault(sequences[read], []).append(read)
        progress.inc()
    progress.done()

    # write the selected reads, most represented first
    outputFileName = "%s.txt" % (options.outputFileName)
    progress       = Progress(len(sequences), "Writing into %s" % (outputFileName), options.verbosity)
    with open(outputFileName, "w") as handle:
        for value in sorted(sortedValues, reverse = True):
            if value < threshold:
                # values are decreasing: nothing else can pass the threshold
                break
            for read in sortedValues[value]:
                handle.write("%s\t%d\n" % (read, value))
                progress.inc()
    progress.done()

    # histogram: number of occurrences -> number of distinct reads
    line     = {}
    progress = Progress(len(values), "Preparing plot", options.verbosity)
    for value in values:
        line[value] = line.get(value, 0) + 1
        progress.inc()
    progress.done()

    plot = RPlotter("%s.png" % (options.outputFileName), options.verbosity)
    plot.setFill(0)
    plot.setMaximumX(options.xMax)
    plot.setXLabel("# occurrences")
    plot.setYLabel("# reads")
    plot.addLine(line)
    plot.plot()

    if options.verbosity > 0:
        print("%d/%.2f/%.1f/%d occurrences" % (Utils.getMinAvgMedMax(line)))
b/smart_toolShed/SMART/Java/Python/getSequence.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,60 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
if __name__ == "__main__":

    # parse command line
    description = "Get Sequence v1.0.1: Get a single sequence in a FASTA file. [Category: Data Selection]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input",     dest="inputFileName",  action="store",            type="string", help="multi-FASTA file [compulsory] [format: file in FASTA format]")
    parser.add_option("-n", "--name",      dest="name",           action="store",            type="string", help="name of the sequence [compulsory] [format: string]")
    parser.add_option("-o", "--output",    dest="outputFileName", action="store",            type="string", help="output sequence file (FASTA) [compulsory] [format: file in FASTA format]")
    parser.add_option("-v", "--verbosity", dest="verbosity",      action="store", default=1, type="int",    help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # fail early, with the usage message, instead of crashing on a None later on
    if options.inputFileName is None or options.name is None or options.outputFileName is None:
        parser.error("options -i, -n and -o are compulsory")

    # read Fasta file
    sequenceListParser = FastaParser(options.inputFileName, options.verbosity)
    for sequence in sequenceListParser.getIterator():
        # only the first word of the FASTA header is compared with the requested name
        name = sequence.name.split(" ")[0]
        if name == options.name:
            writer = FastaWriter(options.outputFileName, options.verbosity)
            writer.addSequence(sequence)
            # close the writer BEFORE leaving: the previous code exited without
            # closing it, and only tried to close it when nothing had been found
            writer.close()
            # NOTE(review): replaces 'print sequence.printFasta(),' (no added newline);
            # presumably printFasta() already ends with a newline -- confirm
            sys.stdout.write(sequence.printFasta())
            sys.exit(0)

    # no writer exists at this point (it used to be closed here, raising a NameError
    # that hid the message below)
    print("No sequence found")
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
LOG_DEPTH = "smart"

class GetSizes(object):
    """
    Compute the size distribution of the elements of a sequence file (fasta, fastq)
    or of a transcript file (any other format, read through TranscriptContainer).

    Depending on the query, the measured elements are the items themselves, their
    exons ("exon"), their first exon ("exon1") or their introns ("intron").
    The distribution can be plotted (outFileName) and/or written as CSV (csv).
    After run(), the results are available in self.items, self.subItems,
    self.nucleotides and self.minAvgMedMax.
    """

    def __init__(self, inFileName = None, inFormat=None, outFileName = None, query=None,xMax=None, xMin=None, csv=False, verbosity = 0):
        """
        @param inFileName:  input file name
        @param inFormat:    format of the input file ("fasta", "fastq", or a transcript format)
        @param outFileName: name of the plot file (no plot if None)
        @param query:       what to measure: "exon", "exon1", "intron", anything else measures the item itself
        @param xMax:        maximum value on the x-axis of the plot
        @param xMin:        minimum value on the x-axis of the plot
        @param csv:         name of the CSV file to write (no CSV if false)
        @param verbosity:   trace level
        """
        self.inFileName  = inFileName
        self.inFormat    = inFormat
        self.outFileName = outFileName
        self.query       = query
        self.xMax        = xMax
        self.xMin        = xMin
        self.xLab        = "Size"
        self.yLab        = "# reads"
        self.barplot     = False
        self.csv         = csv
        self._verbosity  = verbosity
        self.parser      = None
        self._log        = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)

    def setAttributesFromCmdLine(self):
        """Parse the command line and copy every option to the matching attribute."""
        description = "Usage: getSizes.py [options]\n\nGet Sizes v1.0.2: Get the sizes of a set of genomic coordinates. [Category: Visualization]\n"
        epilog = ""
        parser = RepetOptionParser(description = description, epilog = epilog)
        parser.add_option("-i", "--input",     dest="inputFileName",  action="store",      default=None,      type="string", help="input file [compulsory] [format: file in transcript or sequence format given by -f]")
        parser.add_option("-f", "--format",    dest="format",         action="store",      default=None,      type="string", help="format of the input [compulsory] [format: transcript or sequence file format]")
        parser.add_option("-q", "--query",     dest="query",          action="store",      default=None,      type="string", help="type to mesure [default: size] [format: choice (size, intron size, exon size, 1st exon size)]")
        parser.add_option("-o", "--output",    dest="outputFileName", action="store",      default=None,      type="string", help="output file [format: output file in PNG format]")
        parser.add_option("-x", "--xMax",      dest="xMax",           action="store",      default=None,      type="int",    help="maximum value on the x-axis to plot [format: int]")
        parser.add_option("-X", "--xMin",      dest="xMin",           action="store",      default=None,      type="int",    help="minimum value on the x-axis to plot [format: int]")
        parser.add_option("-v", "--verbosity", dest="verbosity",      action="store",      default=1,         type="int",    help="trace level [format: int]")
        # the value of -c is used as the NAME of the CSV file (see run), whatever the help says
        parser.add_option("-c", "--csv",       dest="csv",            action="store",                         type="string", help="write a .csv file [format: bool] [default: false]")
        parser.add_option("-a", "--xLabel",    dest="xLab",           action="store",      default="Size",    type="string", help="x absis label name [format: string] [default: Size]")
        parser.add_option("-b", "--yLabel",    dest="yLab",           action="store",      default="# reads", type="string", help="y absis label name [format: string] [default: Reads]")
        parser.add_option("-B", "--barplot",   dest="barplot",        action="store_true", default=False,                    help="use barplot representation [format: bool] [default: false]")
        options = parser.parse_args()[0]
        self._setAttributesFromOptions(options)

    def _setAttributesFromOptions(self, options):
        """Copy the parsed options to the attributes, through the setters."""
        self.setInFileName(options.inputFileName)
        self.setInFormat(options.format)
        self.setQuery(options.query)
        self.setOutFileName(options.outputFileName)
        self.setXMax(options.xMax)
        self.setXMin(options.xMin)
        self.setxLab(options.xLab)
        self.setyLab(options.yLab)
        self.setBarplot(options.barplot)
        # this call was missing: -c/--csv was parsed, then silently dropped
        self.setCsv(options.csv)
        self.setVerbosity(options.verbosity)

    def setInFileName(self, inputFileName):
        self.inFileName = inputFileName

    def setInFormat(self, inFormat):
        self.inFormat = inFormat

    def setQuery(self, query):
        self.query = query

    def setOutFileName(self, outFileName):
        self.outFileName = outFileName

    def setXMax(self, xMax):
        self.xMax = xMax

    def setXMin(self, xMin):
        self.xMin = xMin

    def setxLab(self, xLab):
        self.xLab = xLab

    def setyLab(self, yLab):
        self.yLab = yLab

    def setBarplot(self, barplot):
        self.barplot = barplot

    def setCsv(self, csv):
        self.csv = csv

    def setVerbosity(self, verbosity):
        self._verbosity = verbosity

    def _checkOptions(self):
        """
        Check the compulsory options and build the parser matching the input format.
        Raise an Exception if the input file name is missing.
        """
        if self.inFileName == None:
            self._logAndRaise("ERROR: Missing input file name")
        if self.inFormat == "fasta":
            self.parser = FastaParser(self.inFileName, self._verbosity)
        elif self.inFormat == "fastq":
            self.parser = FastqParser(self.inFileName, self._verbosity)
        else:
            self.parser = TranscriptContainer(self.inFileName, self.inFormat, self._verbosity)

    def _logAndRaise(self, errorMsg):
        """Log the message as an error, then raise it as an Exception."""
        self._log.error(errorMsg)
        raise Exception(errorMsg)

    def _selectItems(self, item):
        """
        Give the elements of an item which should be measured, following self.query.
        @return: the list of exons, the first exon, the introns, or the item itself
        """
        if self.query == "exon":
            return item.getExons()
        if self.query == "exon1":
            if len(item.getExons()) > 1:
                item.sortExons()
            # NOTE(review): fails if the item has no exon at all -- confirm it cannot happen
            return [item.getExons()[0]]
        if self.query == "intron":
            return item.getIntrons()
        return [item, ]

    def _getNbElements(self, item):
        """
        Give the number of elements an item stands for (tag "nbElements"), 1 by default.
        The default is deliberately best-effort: it applies when the item cannot carry
        tags, when the tag is not set, or when its value is not a number.
        """
        try:
            return int(float(item.getTagValue("nbElements")))
        except Exception:
            # 'except Exception' (and not a bare except) lets Ctrl-C and sys.exit through
            return 1

    def run(self):
        """
        Read the input, count the elements of each size, then plot the distribution,
        write the CSV file and print the statistics.
        Raise an Exception if the input file name is missing or if no element is found.
        """
        LoggerFactory.setLevel(self._log, self._verbosity)
        self._checkOptions()
        self._log.info("START getsizes")
        self._log.debug("Input file name: %s" % self.inFileName)

        nbItems = self.parser.getNbItems()
        self._log.info( "%i items found" % (nbItems))

        # treat items
        progress   = Progress(nbItems, "Analyzing sequences of %s" % (self.inFileName), self._verbosity)
        sizes      = {}    # size -> number of elements of this size
        names      = {}    # size -> names of the elements of this size (only filled for the CSV)
        totalSize  = 0
        number     = 0
        nbSubItems = 0
        for item in self.parser.getIterator():
            for thisItem in self._selectItems(item):
                nbElements  = self._getNbElements(thisItem)
                size        = thisItem.getSize()
                sizes[size] = sizes.get(size, 0) + nbElements
                if self.csv:
                    # the name is only needed (and only read) when a CSV file is requested
                    names.setdefault(size, []).append(thisItem.name.split()[0])
                totalSize  += size
                nbSubItems += nbElements
                number     += 1
            progress.inc()
        progress.done()

        # checked before any output is produced, so that an empty input leaves no empty plot
        if nbSubItems == 0:
            self._logAndRaise("No item found")

        if self.outFileName != None:
            plotter = RPlotter(self.outFileName, self._verbosity)
            plotter.setFill(0)
            plotter.setMinimumX(self.xMin)
            plotter.setMaximumX(self.xMax)
            plotter.setXLabel(self.xLab)
            plotter.setYLabel(self.yLab)
            plotter.setBarplot(self.barplot)
            plotter.addLine(sizes)
            plotter.plot()

        if self.csv:
            # one line per size between the extrema, including the sizes never observed
            with open(self.csv, "w") as csvHandle:
                for size in range(min(sizes.keys()), max(sizes.keys())+1):
                    if size not in sizes:
                        csvHandle.write("%d,0,\n" % (size))
                    else:
                        csvHandle.write("%d,%d,%s\n" % (size, sizes[size], ";".join(names[size])))

        self.items        = number
        self.subItems     = nbSubItems
        self.nucleotides  = totalSize
        self.minAvgMedMax = Utils.getMinAvgMedMax(sizes)

        print("%d items" % (number))
        print("%d sub-items" % (nbSubItems))
        print("%d nucleotides" % (totalSize))
        print("min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % self.minAvgMedMax)

        self._log.info("END getsizes")


if __name__ == "__main__":
    iGetSizes = GetSizes()
    iGetSizes.setAttributesFromCmdLine()
    iGetSizes.run()
+#TODO: add two more options!!!!!! diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/getWigData.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/getWigData.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,67 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
if __name__ == "__main__":

    # command line definition
    description = "Get WIG Data v1.0.1: Compute the average data for some genomic coordinates using WIG files (thus covering a large proportion of the genome) and update a tag. [Category: WIG Tools]"

    optionParser = OptionParser(description = description)
    optionParser.add_option(
        "-i", "--input", dest="inputFileName", action="store", type="string",
        help="input file [compulsory] [format: file in transcript format given by -f]")
    optionParser.add_option(
        "-f", "--inputFormat", dest="inputFormat", action="store", type="string",
        help="format of the input file [compulsory] [format: transcript file format]")
    optionParser.add_option(
        "-w", "--wig", dest="wig", action="store", type="string",
        help="wig file name [compulsory] [format: file in WIG format]")
    optionParser.add_option(
        "-t", "--tag", dest="tag", action="store", type="string",
        help="choose a tag name to write the wig information to output file [compulsory] [format: file in WIG format]")
    optionParser.add_option(
        "-o", "--output", dest="outputFileName", action="store", type="string",
        help="output file [compulsory] [format: output file in GFF3 format]")
    optionParser.add_option(
        "-s", "--strands", dest="strands", action="store_true", default=False,
        help="consider both strands separately [format: boolean] [default: False]")
    optionParser.add_option(
        "-v", "--verbosity", dest="verbosity", action="store", default=1, type="int",
        help="trace level [format: int]")
    options, args = optionParser.parse_args()

    # input transcripts, WIG data source and GFF3 output
    transcriptParser = TranscriptContainer(options.inputFileName, options.inputFormat, options.verbosity)
    wigParser        = WigParser(options.wig)
    writer           = Gff3Writer(options.outputFileName, options.verbosity)
    wigParser.setStrands(options.strands)

    # tag each transcript with the mean of the WIG values it covers, and write it
    progress = Progress(transcriptParser.getNbTranscripts(), "Parsing %s" % (options.inputFileName), options.verbosity)
    for transcript in transcriptParser.getIterator():
        wigValues = transcript.extractWigData(wigParser)
        if options.strands:
            # in stranded mode, the data are indexed by direction: keep the transcript's own strand
            wigValues = wigValues[transcript.getDirection()]
        total = float(sum(wigValues))
        mean  = total / len(wigValues)
        transcript.setTagValue(options.tag, str(mean))
        writer.addTranscript(transcript)
        progress.inc()
    progress.done()
    # NOTE(review): the writer is never closed explicitly here -- confirm Gff3Writer flushes by itself
def padToSize(values, expectedSize, defaultValue):
    """
    Complete a list of WIG values on its left side, so that it reaches the expected size.

    @param values:       the list of values read from the WIG data
    @param expectedSize: the number of values which should be available
    @param defaultValue: the value used for the missing positions
    @return:             a list of at least expectedSize values (the input list itself when
                         nothing is missing; a longer list is left to the caller to reject)
    """
    nbMissing = expectedSize - len(values)
    if nbMissing <= 0:
        return values
    return [defaultValue] * nbMissing + values


if __name__ == "__main__":

    # parse command line
    description = "Get WIG Data v1.0.2: Compute the average data around some genomic coordinates using WIG files (thus covering a large proportion of the genome). [Category: WIG Tools]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input",       dest="inputFileName",  action="store",                     type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--inputFormat", dest="inputFormat",    action="store",                     type="string", help="format of the input file [compulsory] [format: transcript file format]")
    parser.add_option("-w", "--wig",         dest="wig",            action="store",                     type="string", help="wig file name [compulsory] [format: file in WIG format]")
    parser.add_option("-d", "--distance",    dest="distance",       action="store",      default=1000,  type="int",    help="distance around position [compulsory] [format: int] [default: 1000]")
    parser.add_option("-s", "--strands",     dest="strands",        action="store_true", default=False,                help="consider both strands separately [format: boolean] [default: False]")
    parser.add_option("-o", "--output",      dest="outputFileName", action="store",                     type="string", help="output file [compulsory] [format: output file in PNG format]")
    parser.add_option("-a", "--default",     dest="defaultValue",   action="store",      default=0.0,   type="float",  help="default value (when value is NA) [default: 0.0] [format: float]")
    parser.add_option("-l", "--log",         dest="log",            action="store_true", default=False,                help="use log scale for y-axis [format: boolean] [default: False]")
    parser.add_option("-k", "--keep",        dest="keep",           action="store_true", default=False,                help="keep temporary files [format: boolean] [default: False]")
    parser.add_option("-v", "--verbosity",   dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # create parsers and writers
    transcriptParser = TranscriptContainer(options.inputFileName, options.inputFormat, options.verbosity)
    wigParser        = WigParser(options.wig)
    wigParser.setStrands(options.strands)
    wigParser.setDefaultValue(options.defaultValue)

    # allocate data: one accumulator per strand and per position in [-distance, +distance]
    strands    = (1, -1) if options.strands else (1, )
    windowSize = 2 * options.distance + 1
    values     = {}
    for strand in strands:
        values[strand] = dict([(i, 0.0) for i in range(-options.distance, options.distance+1)])

    # read transcripts
    progress = Progress(transcriptParser.getNbTranscripts(), "Parsing %s" % (options.inputFileName), options.verbosity)
    for transcript in transcriptParser.getIterator():
        # turn the transcript into a window of 2 + distance + (distance - 1) positions
        transcript.removeExons()
        transcript.restrictStart(2)
        transcript.extendStart(options.distance)
        transcript.extendEnd(options.distance-1)
        theseValues = transcript.extractWigData(wigParser)
        if len(strands) == 1:
            # unstranded data come as a plain list: index them like the stranded ones
            theseValues = {1: theseValues}
        for strand in strands:
            strandValues = padToSize(theseValues[strand], windowSize, options.defaultValue)
            if len(strandValues) != windowSize:
                raise Exception("Got something wrong with the size of the WIG data concerning %s: %d found instead of %d" % (transcript, len(strandValues), windowSize))
            for offset, value in enumerate(strandValues):
                values[strand][offset - options.distance] += value
        progress.inc()
    progress.done()

    # average the sums; the count is read once instead of once per position, and an
    # empty input keeps a flat profile instead of raising a ZeroDivisionError
    nbTranscripts = transcriptParser.getNbTranscripts()
    if nbTranscripts > 0:
        for strand in strands:
            # multiplying by the strand gives the minus strand negative values
            divisor = nbTranscripts * strand
            for i in range(-options.distance, options.distance+1):
                values[strand][i] /= divisor

    # draw plot
    plotter = RPlotter(options.outputFileName, options.verbosity, options.keep)
    plotter.setXLabel("Distance")
    plotter.setYLabel("WigValue")
    for strand in strands:
        plotter.addLine(values[strand])
    if options.log:
        plotter.setLog("y")
    plotter.plot()
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +""" +Cluster the data into regions (defined by size and overlap with next region) and keep only highest peaks. 
class GetWigProfile(object):
    """
    Compute the average WIG profile of a set of transcripts.

    Each transcript is rescaled to nbPoints points, and is surrounded by 'distance'
    un-scaled positions on each side.  The caller sets the attributes strands,
    inputFileName, inputFormat, wig, nbPoints, distance, smoothenForce, defaultValue,
    outputFileName and log, then calls readTranscripts(), smoothen() and plot().
    """

    def __init__(self, verbosity):
        """
        @param verbosity: trace level
        """
        self.verbosity    = verbosity
        self.values       = {}
        self.defaultValue = 0.0

    def _iToJ(self, i, size):
        """
        Convert a position in the extended transcript into a point of the rescaled profile.
        @param i:    position in the WIG values (the transcript starts at self.distance)
        @param size: size of the transcript, without the flanking regions
        @return:     the index of the point among the self.nbPoints points of the transcript
        """
        return min(self.nbPoints+1, int(math.floor(float(i - self.distance) / (size) * (self.nbPoints))))

    def _profileValues(self, wigValues, transcriptSize):
        """
        Build the profile of one transcript: the upstream flank, the transcript rescaled
        to self.nbPoints points (mean of the values falling on each point), and the
        downstream flank.

        @param wigValues:      WIG values of the transcript extended by self.distance on each side
        @param transcriptSize: size of the transcript, without the flanking regions
        @return:               a list of self.nbPoints + 2 * self.distance values
        """
        nbWigValues    = len(wigValues)
        fivePValues    = wigValues[: self.distance]
        # do not write wigValues[-self.distance:]: when distance is 0 (the default),
        # [-0:] is the WHOLE list, which made the profile too long
        threePValues   = wigValues[nbWigValues - self.distance: ]
        nbValues       = [0.0] * (self.nbPoints)
        sumValues      = [0.0] * (self.nbPoints)
        for i in range(self.distance, nbWigValues - self.distance):
            startJ = self._iToJ(i, transcriptSize)
            # a position always feeds at least one point (several if the transcript is short)
            endJ   = max(startJ+1, self._iToJ(i+1, transcriptSize))
            for j in range(startJ, endJ):
                sumValues[j] += wigValues[i]
                nbValues[j]  += 1
        scaledValues = [self.defaultValue if nbValue == 0 else sumValue / nbValue for sumValue, nbValue in zip(sumValues, nbValues)]
        return fivePValues + scaledValues + threePValues

    def readTranscripts(self):
        """
        Read the transcripts and the WIG data, and store the average profile of each
        strand in self.values (strand -> point -> value).
        Raise an Exception if the WIG data of a transcript have an unexpected size.
        """
        self.strandNames = (1, -1) if self.strands else (1, )
        profileSize      = self.nbPoints + 2 * self.distance
        self.values      = dict([(strand, dict([(i, 0.0) for i in range(profileSize)])) for strand in self.strandNames])
        transcriptParser = TranscriptContainer(self.inputFileName, self.inputFormat, self.verbosity)
        wigParser        = WigParser(self.wig)
        wigParser.setStrands(self.strands)
        wigParser.setDefaultValue(self.defaultValue)

        progress = Progress(transcriptParser.getNbTranscripts(), "Parsing %s" % (self.inputFileName), self.verbosity)
        for transcript in transcriptParser.getIterator():
            transcriptSize = transcript.getSize()
            expectedSize   = transcriptSize + 2 * self.distance
            transcript.extendStart(self.distance)
            transcript.extendEnd(self.distance)
            theseValues = transcript.extractWigData(wigParser)

            if len(self.strandNames) == 1:
                # unstranded data come as a plain list: index them like the stranded ones
                theseValues = {1: theseValues}
            for strand in self.strandNames:
                if len(theseValues[strand]) < expectedSize:
                    theseValues[strand] = [self.defaultValue] * (expectedSize - len(theseValues[strand])) + theseValues[strand]
                if len(theseValues[strand]) != expectedSize:
                    raise Exception("Got something wrong with the size of the WIG data concerning %s [%s]: %d found instead of %d" % (transcript, ",".join(["%d-%d" % (exon.getStart(), exon.getEnd()) for exon in transcript.getExons()]), len(theseValues[strand]), expectedSize))
                for i, value in enumerate(self._profileValues(theseValues[strand], transcriptSize)):
                    self.values[strand][i] += value
            progress.inc()
        progress.done()

        # average the sums; an empty input keeps a flat profile instead of raising a ZeroDivisionError
        nbTranscripts = transcriptParser.getNbTranscripts()
        if nbTranscripts > 0:
            for strand in self.strandNames:
                # multiplying by the strand gives the minus strand negative values
                divisor = nbTranscripts * strand
                for i in range(profileSize):
                    self.values[strand][i] /= divisor


    def smoothen(self):
        """
        Replace each point of the transcript by the mean of the points located at less
        than self.smoothenForce points from it.  The flanking regions are neither
        modified nor used.  Nothing is done if smoothenForce is None or lower than 1.
        """
        if self.smoothenForce is None or self.smoothenForce < 1:
            return
        firstPoint = self.distance
        lastPoint  = self.distance + self.nbPoints - 1
        for strand in self.strandNames:
            averageValues = {}
            for center in range(firstPoint, lastPoint + 1):
                # the window is clipped to the transcript, BOTH bounds included: the first
                # point used to be excluded, which also divided by zero when smoothenForce was 1
                windowStart = max(firstPoint, center - self.smoothenForce + 1)
                windowEnd   = min(lastPoint, center + self.smoothenForce - 1)
                window      = [self.values[strand][i] for i in range(windowStart, windowEnd + 1)]
                averageValues[center] = float(sum(window)) / len(window)
            # written in a second step, so that every mean is computed on the original values
            for position in range(firstPoint, lastPoint + 1):
                self.values[strand][position] = averageValues[position]


    def plot(self):
        """Plot one line per strand, with the start and the end of the transcript marked on the x-axis."""
        plotter = RPlotter(self.outputFileName, self.verbosity)
        for strand in self.strandNames:
            plotter.addLine(self.values[strand])
        if self.log:
            plotter.setLog("y")
        plotter.setAxisLabel("x", {0: -self.distance, self.distance: "start", self.distance+self.nbPoints-1: "end", 2*self.distance+self.nbPoints-1: self.distance})
        plotter.plot()



if __name__ == "__main__":

    # parse command line
    description = "Get WIG Profile v1.0.1: Compute the average profile of some genomic coordinates using WIG files (thus covering a large proportion of the genome). [Category: WIG Tools]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input",       dest="inputFileName",  action="store",                     type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--inputFormat", dest="inputFormat",    action="store",                     type="string", help="format of the input file [compulsory] [format: transcript file format]")
    parser.add_option("-w", "--wig",         dest="wig",            action="store",                     type="string", help="wig file name [compulsory] [format: file in WIG format]")
    parser.add_option("-p", "--nbPoints",    dest="nbPoints",       action="store",      default=1000,  type="int",    help="number of points on the x-axis [compulsory] [format: int] [default: 1000]")
    parser.add_option("-d", "--distance",    dest="distance",       action="store",      default=0,     type="int",    help="distance around genomic coordinates [compulsory] [format: int] [default: 0]")
    parser.add_option("-s", "--strands",     dest="strands",        action="store_true", default=False,                help="consider both strands separately [format: boolean] [default: False]")
    parser.add_option("-m", "--smoothen",    dest="smoothen",       action="store",      default=None,  type="int",    help="smoothen the curve [format: int] [default: None]")
    parser.add_option("-a", "--default",     dest="defaultValue",   action="store",      default=0.0,   type="float",  help="default value (when value is NA) [default: 0.0] [format: float]")
    parser.add_option("-o", "--output",      dest="outputFileName", action="store",                     type="string", help="output file [compulsory] [format: output file in PNG format]")
    parser.add_option("-l", "--log",         dest="log",            action="store_true", default=False,                help="use log scale for y-axis [format: boolean] [default: False]")
    parser.add_option("-v", "--verbosity",   dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int]")
    (options, args) = parser.parse_args()

    wigProfile = GetWigProfile(options.verbosity)
    wigProfile.strands        = options.strands
    wigProfile.inputFileName  = options.inputFileName
    wigProfile.inputFormat    = options.inputFormat
    wigProfile.wig            = options.wig
    wigProfile.nbPoints       = options.nbPoints
    wigProfile.distance       = options.distance
    wigProfile.smoothenForce  = options.smoothen
    wigProfile.defaultValue   = options.defaultValue
    wigProfile.outputFileName = options.outputFileName
    wigProfile.log            = options.log

    wigProfile.readTranscripts()
    wigProfile.smoothen()
    wigProfile.plot()
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +""" +Read a mapping file (many formats supported) and select some of them +Mappings should be sorted by read names +""" +import os, random, shelve +from optparse import OptionParser, OptionGroup +from commons.core.parsing.ParserChooser import ParserChooser +from commons.core.parsing.FastaParser import FastaParser +from commons.core.parsing.FastqParser import FastqParser +from commons.core.parsing.GffParser import GffParser +from commons.core.writer.BedWriter import BedWriter +from commons.core.writer.UcscWriter import UcscWriter +from commons.core.writer.GbWriter import GbWriter +from commons.core.writer.Gff2Writer import Gff2Writer +from commons.core.writer.Gff3Writer import Gff3Writer +from commons.core.writer.FastaWriter import FastaWriter +from commons.core.writer.FastqWriter import FastqWriter +from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter +from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection +from SMART.Java.Python.mySql.MySqlTable import MySqlTable +from SMART.Java.Python.misc.RPlotter import RPlotter +from SMART.Java.Python.misc.Progress import Progress +from 
SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress + + +distanceExons = 20 +exonSize = 20 + + +class MapperAnalyzer(object): + """ + Analyse the output of a parser + """ + + def __init__(self, verbosity = 0): + self.verbosity = verbosity + self.mySqlConnection = MySqlConnection(verbosity) + self.tooShort = 0 + self.tooManyMismatches = 0 + self.tooManyGaps = 0 + self.tooShortExons = 0 + self.tooManyMappings = 0 + self.nbMappings = 0 + self.nbSequences = 0 + self.nbAlreadyMapped = 0 + self.nbAlreadyMappedSequences = 0 + self.nbWrittenMappings = 0 + self.nbWrittenSequences = 0 + self.parser = None + self.logHandle = None + self.randomNumber = random.randint(0, 100000) + self.gff3Writer = None + self.alreadyMappedReader = None + self.unmatchedWriter = None + self.sequenceListParser = None + self.sequences = None + self.alreadyMapped = None + self.mappedNamesTable = None + self.minSize = None + self.minId = None + self.maxMismatches = None + self.maxGaps = None + self.maxMappings = None + self.merge = False + self.checkExons = False + self.suffix = None + self.tmpDirectory = "%s%s" % (os.environ["SMARTMPPATH"], os.sep) if "SMARTMPPATH" in os.environ else "" + + + def __del__(self): + if self.sequences != None: + self.sequences.close() + if self.alreadyMapped != None: + self.alreadyMapped.close() + if self.mappedNamesTable != None: + self.mappedNamesTable.remove() + if self.gff3Writer != None: + self.gff3Writer.close() + + if self.logHandle != None: + self.logHandle.close() + + + def setMappingFile(self, fileName, format): + parserChooser = ParserChooser(self.verbosity) + parserChooser.findFormat(format, "mapping") + self.parser = parserChooser.getParser(fileName) + + + def setSequenceFile(self, fileName, format): + if format == "fasta": + self.sequenceListParser = FastaParser(fileName, self.verbosity) + elif format == "fastq": + self.sequenceListParser = FastqParser(fileName, self.verbosity) + else: + raise Exception("Do not understand sequence format %s" 
% (format)) + + + def setOutputFile(self, fileName, title): + self.gff3Writer = Gff3Writer(fileName, self.verbosity) + self.gff3Writer.setTitle(title) + + + def setAlreadyMatched(self, fileName): + self.alreadyMappedReader = GffParser(fileName, self.verbosity) + + + def setRemainingFile(self, fileName, format): + if format == "fasta": + self.unmatchedWriter = FastaWriter("%s_unmatched.fasta" % (fileName), self.verbosity) + elif format == "fastq": + self.unmatchedWriter = FastqWriter("%s_unmatched.fastq" % (fileName), self.verbosity) + else: + raise Exception("Do not understand %s format." % (format)) + self.mappedNamesTable = MySqlTable(self.mySqlConnection, "mappedNames_%d" % (self.randomNumber), self.verbosity) + self.mappedNamesTable.create(["name"], {"name": "char"}, {"name": 50}) + self.mappedNamesTable.createIndex("iNameMapped", ["name", ], True) + + + def setLog(self, fileName): + self.logHandle = open(fileName, "w") + + + def setMinSize(self, size): + self.minSize = size + + + def setMinId(self, id): + self.minId = id + + + def setMaxMismatches(self, mismatches): + self.maxMismatches = mismatches + + + def setMaxGaps(self, gaps): + self.maxGaps = gaps + + + def setMaxMappings(self, mappings): + self.maxMappings = mappings + + + def mergeExons(self, b): + self.merge = b + + + def acceptShortExons(self, b): + self.checkExons = not b + + + def countMappings(self): + self.nbMappings = self.parser.getNbMappings() + if self.verbosity > 0: + print "%i matches found" % (self.nbMappings) + + + def storeAlreadyMapped(self): + self.alreadyMapped = shelve.open("%stmpAlreadyMapped_%d" % (self.tmpDirectory, self.randomNumber)) + progress = Progress(self.alreadyMappedReader.getNbTranscripts(), "Reading already mapped reads", self.verbosity) + self.nbAlreadyMappedSequences = 0 + for transcript in self.alreadyMappedReader.getIterator(): + if not self.alreadyMapped.has_key(transcript.getName()): + self.alreadyMapped[transcript.getName()] = 1 + self.nbAlreadyMappedSequences 
+= 1 + progress.inc() + progress.done() + self.nbAlreadyMapped = self.alreadyMappedReader.getNbTranscripts() + + + def storeSequences(self): + self.sequences = shelve.open("%stmpSequences_%d" % (self.tmpDirectory, self.randomNumber)) + progress = Progress(self.sequenceListParser.getNbSequences(), "Reading sequences", self.verbosity) + for sequence in self.sequenceListParser.getIterator(): + self.sequences[sequence.getName().split(" ")[0]] = len(sequence.getSequence()) + self.nbSequences += 1 + progress.inc() + progress.done() + if self.verbosity > 0: + print "%i sequences read" % (self.nbSequences) + + + def checkOrder(self): + names = shelve.open("%stmpNames_%d" % (self.tmpDirectory, self.randomNumber)) + previousName = None + progress = Progress(self.nbMappings, "Checking mapping file", self.verbosity) + for mapping in self.parser.getIterator(): + name = mapping.queryInterval.getName() + if name != previousName and previousName != None: + if names.has_key(previousName): + raise Exception("Error! Input mapping file is not ordered! 
(Name '%s' occurs at least twice)" % (previousName)) + names[previousName] = 1 + previousName = name + progress.inc() + progress.done() + names.close() + + + def checkPreviouslyMapped(self, name): + if self.alreadyMappedReader == None: + return False + return self.alreadyMapped.has_key(name) + + + def findOriginalSize(self, name): + alternate = "%s/1" % (name) + if (self.suffix == None) or (not self.suffix): + if self.sequences.has_key(name): + self.suffix = False + return self.sequences[name] + if self.suffix == None: + self.suffix = True + else: + raise Exception("Cannot find name %n" % (name)) + if (self.suffix): + if self.sequences.has_key(alternate): + return self.sequences[alternate] + raise Exception("Cannot find name %s" % (name)) + + + def checkErrors(self, mapping): + accepted = True + # short size + if self.minSize != None and mapping.size * 100 < self.minSize * mapping.queryInterval.size: + self.tooShort += 1 + accepted = False + if self.logHandle != None: + self.logHandle.write("size of mapping %s is too short (%i instead of %i)\n" % (str(mapping), mapping.queryInterval.size, mapping.size)) + # low identity + if self.minId != None and mapping.getTagValue("identity") < self.minId: + self.tooManyMismatches += 1 + accepted = False + if self.logHandle != None: + self.logHandle.write("mapping %s has a low identity rate\n" % (str(mapping))) + # too many mismatches + if self.maxMismatches != None and mapping.getTagValue("nbMismatches") > self.maxMismatches: + self.tooManyMismatches += 1 + accepted = False + if self.logHandle != None: + self.logHandle.write("mapping %s has more mismatches than %i\n" % (str(mapping), self.maxMismatches)) + # too many gaps + if self.maxGaps != None and mapping.getTagValue("nbGaps") > self.maxGaps: + self.tooManyGaps += 1 + accepted = False + if self.logHandle != None: + self.logHandle.write("mapping %s has more gaps than %i\n" % (str(mapping), self.maxGaps)) + # short exons + if self.checkExons and len(mapping.subMappings) > 1 
and min([subMapping.targetInterval.getSize() for subMapping in mapping.subMappings]) < exonSize: + self.tooShortExons += 1 + accepted = False + if self.logHandle != None: + self.logHandle.write("sequence %s maps as too short exons\n" % (mapping)) + return accepted + + + def checkNbMappings(self, mappings): + nbOccurrences = 0 + for mapping in mappings: + nbOccurrences += 1 if "nbOccurrences" not in mapping.getTagNames() else mapping.getTagValue("nbOccurrences") + if (self.maxMappings != None and nbOccurrences > self.maxMappings): + self.tooManyMappings += 1 + if self.logHandle != None: + self.logHandle.write("sequence %s maps %i times\n" % (mappings[0].queryInterval.getName(), nbOccurrences)) + return False + return (nbOccurrences > 0) + + + def sortMappings(self, mappings): + nbOccurrences = 0 + for mapping in mappings: + nbOccurrences += 1 if "nbOccurrences" not in mapping.getTagNames() else mapping.getTagValue("nbOccurrences") + + orderedMappings = sorted(mappings, key = lambda mapping: mapping.getErrorScore()) + cpt = 1 + rank = 1 + previousMapping = None + previousScore = None + wasLastTie = False + rankedMappings = [] + bestRegion = "%s:%d-%d" % (orderedMappings[0].targetInterval.getChromosome(), orderedMappings[0].targetInterval.getStart(), orderedMappings[0].targetInterval.getEnd()) + for mapping in orderedMappings: + mapping.setNbOccurrences(nbOccurrences) + mapping.setOccurrence(cpt) + + score = mapping.getErrorScore() + if previousScore != None and previousScore == score: + if "Rank" in previousMapping.getTagNames(): + if not wasLastTie: + previousMapping.setRank("%sTie" % (rank)) + mapping.setRank("%sTie" % (rank)) + wasLastTie = True + else: + rank = cpt + mapping.setRank(rank) + wasLastTie = False + if cpt != 1: + mapping.setBestRegion(bestRegion) + + rankedMappings.append(mapping) + previousMapping = mapping + previousScore = score + cpt += 1 + return rankedMappings + + + def processMappings(self, mappings): + if not mappings: + return + 
selectedMappings = [] + name = mappings[0].queryInterval.getName() + size = self.findOriginalSize(name) + for mapping in mappings: + if self.merge: + mapping.mergeExons(distanceExons) + mapping.queryInterval.size = size + if self.checkErrors(mapping): + selectedMappings.append(mapping) + + if self.checkNbMappings(selectedMappings): + if self.unmatchedWriter != None: + query = self.mySqlConnection.executeQuery("INSERT INTO %s (name) VALUES ('%s')" % (self.mappedNamesTable.name, name if not self.suffix else "%s/1" % (name))) + self.nbWrittenSequences += 1 + mappings = self.sortMappings(selectedMappings) + for mapping in mappings: + self.nbWrittenMappings += 1 + self.gff3Writer.addTranscript(mapping.getTranscript()) + + + def readMappings(self): + previousQueryName = None + mappings = [] + self.parser.reset() + progress = Progress(self.nbMappings, "Reading mappings", self.verbosity) + for mapping in self.parser.getIterator(): + queryName = mapping.queryInterval.getName().split(" ")[0] + if self.checkPreviouslyMapped(queryName): + if self.logHandle != None: + self.logHandle.write("Mapping %s has already been mapped.\n" % (queryName)) + else: + if previousQueryName == queryName: + mappings.append(mapping) + else: + if previousQueryName != None: + self.processMappings(mappings) + previousQueryName = queryName + mappings = [mapping, ] + progress.inc() + self.processMappings(mappings) + self.gff3Writer.write() + self.gff3Writer.close() + progress.done() + + + def writeUnmatched(self): + progress = Progress(self.nbSequences, "Reading unmatched sequences", self.verbosity) + for sequence in self.sequenceListParser.getIterator(): + name = sequence.getName().split(" ")[0] + query = self.mySqlConnection.executeQuery("SELECT * FROM %s WHERE name = '%s' LIMIT 1" % (self.mappedNamesTable.name, name)) + if query.isEmpty(): + self.unmatchedWriter.addSequence(sequence) + progress.inc() + progress.done() + + + def analyze(self): + self.countMappings() + self.checkOrder() + 
self.storeSequences() + if self.alreadyMappedReader != None: + self.storeAlreadyMapped() + self.readMappings() + if self.unmatchedWriter != None: + self.writeUnmatched() + + + + +if __name__ == "__main__": + + # parse command line + description = "Mapper Analyzer v1.0.1: Read the output of an aligner, print statistics and possibly translate into BED or GBrowse formats. [Category: Conversion]" + + parser = OptionParser(description = description) + compGroup = OptionGroup(parser, "Compulsory options") + filtGroup = OptionGroup(parser, "Filtering options") + tranGroup = OptionGroup(parser, "Transformation options") + outpGroup = OptionGroup(parser, "Output options") + otheGroup = OptionGroup(parser, "Other options") + compGroup.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file (output of the tool) [compulsory] [format: file in mapping format given by -f]") + compGroup.add_option("-f", "--format", dest="format", action="store", default="seqmap", type="string", help="format of the file [compulsory] [format: mapping file format]") + compGroup.add_option("-q", "--sequences", dest="sequencesFileName", action="store", type="string", help="file of the sequences [compulsory] [format: file in sequence format given by -k]") + compGroup.add_option("-k", "--seqFormat", dest="sequenceFormat", action="store", default="fasta", type="string", help="format of the sequences: fasta or fastq [default: fasta] [format: sequence file format]") + compGroup.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]") + filtGroup.add_option("-n", "--number", dest="number", action="store", default=None, type="int", help="max. 
number of occurrences of a sequence [format: int]") + filtGroup.add_option("-s", "--size", dest="size", action="store", default=None, type="int", help="minimum pourcentage of size [format: int]") + filtGroup.add_option("-d", "--identity", dest="identity", action="store", default=None, type="int", help="minimum pourcentage of identity [format: int]") + filtGroup.add_option("-m", "--mismatch", dest="mismatch", action="store", default=None, type="int", help="maximum number of mismatches [format: int]") + filtGroup.add_option("-p", "--gap", dest="gap", action="store", default=None, type="int", help="maximum number of gaps [format: int]") + tranGroup.add_option("-e", "--mergeExons", dest="mergeExons", action="store_true", default=False, help="merge exons when introns are short [format: bool] [default: false]") + tranGroup.add_option("-x", "--removeExons", dest="removeExons", action="store_true", default=False, help="remove transcripts when exons are short [format: bool] [default: false]") + outpGroup.add_option("-t", "--title", dest="title", action="store", default="SMART", type="string", help="title of the UCSC track [format: string] [default: SMART]") + outpGroup.add_option("-r", "--remaining", dest="remaining", action="store_true", default=False, help="print the unmatched sequences [format: bool] [default: false]") + otheGroup.add_option("-a", "--append", dest="appendFileName", action="store", default=None, type="string", help="append to GFF3 file [format: file in GFF3 format]") + otheGroup.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]") + otheGroup.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]") + parser.add_option_group(compGroup) + parser.add_option_group(filtGroup) + parser.add_option_group(tranGroup) + parser.add_option_group(outpGroup) + parser.add_option_group(otheGroup) + (options, args) = 
parser.parse_args() + + + analyzer = MapperAnalyzer(options.verbosity) + analyzer.setMappingFile(options.inputFileName, options.format) + analyzer.setSequenceFile(options.sequencesFileName, options.sequenceFormat) + analyzer.setOutputFile(options.outputFileName, options.title) + if options.appendFileName != None: + analyzer.setAlreadyMatched(options.appendFileName) + if options.remaining: + analyzer.setRemainingFile(options.outputFileName, options.sequenceFormat) + if options.number != None: + analyzer.setMaxMappings(options.number) + if options.size != None: + analyzer.setMinSize(options.size) + if options.identity != None: + analyzer.setMinId(options.identity) + if options.mismatch != None: + analyzer.setMaxMismatches(options.mismatch) + if options.gap != None: + analyzer.setMaxGaps(options.gap) + if options.mergeExons: + analyzer.mergeExons(True) + if options.removeExons: + analyzer.acceptShortExons(False) + if options.log: + analyzer.setLog("%s.log" % (options.outputFileName)) + analyzer.analyze() + + if options.verbosity > 0: + print "kept %i sequences over %s (%f%%)" % (analyzer.nbWrittenSequences, analyzer.nbSequences, float(analyzer.nbWrittenSequences) / analyzer.nbSequences * 100) + if options.appendFileName != None: + print "kept %i sequences over %s (%f%%) including already mapped sequences" % (analyzer.nbWrittenSequences + analyzer.nbAlreadyMappedSequences, analyzer.nbSequences, float(analyzer.nbWrittenSequences + analyzer.nbAlreadyMappedSequences) / analyzer.nbSequences * 100) + print "kept %i mappings over %i (%f%%)" % (analyzer.nbWrittenMappings, analyzer.nbMappings, float(analyzer.nbWrittenMappings) / analyzer.nbMappings * 100) + if options.appendFileName != None: + print "kept %i mappings over %i (%f%%) including already mapped" % (analyzer.nbWrittenMappings + analyzer.nbAlreadyMapped, analyzer.nbMappings, float(analyzer.nbWrittenMappings + analyzer.nbAlreadyMapped) / analyzer.nbMappings * 100) + print "removed %i too short mappings (%f%%)" % 
(analyzer.tooShort, float(analyzer.tooShort) / analyzer.nbMappings * 100) + print "removed %i mappings with too many mismatches (%f%%)" % (analyzer.tooManyMismatches, float(analyzer.tooManyMismatches) / analyzer.nbMappings * 100) + print "removed %i mappings with too many gaps (%f%%)" % (analyzer.tooManyGaps, float(analyzer.tooManyGaps) / analyzer.nbMappings * 100) + print "removed %i mappings with too short exons (%f%%)" % (analyzer.tooShortExons, float(analyzer.tooShortExons) / analyzer.nbMappings * 100) + print "removed %i sequences with too many hits (%f%%)" % (analyzer.tooManyMappings, float(analyzer.tooManyMappings) / analyzer.nbSequences * 100) + print "%i sequences have no mapping (%f%%)" % (analyzer.nbSequences - analyzer.nbWrittenSequences, float(analyzer.nbSequences - analyzer.nbWrittenSequences) / analyzer.nbSequences * 100) + if options.appendFileName != None: + print "%i sequences have no mapping (%f%%) excluding already mapped sequences" % (analyzer.nbSequences - analyzer.nbWrittenSequences - analyzer.nbAlreadyMappedSequences, float(analyzer.nbSequences - analyzer.nbWrittenSequences - analyzer.nbAlreadyMappedSequences) / analyzer.nbSequences * 100) + + diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/mappingToCoordinates.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/mappingToCoordinates.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,91 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". 
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# + + +"""Convert files with some mapping format to coordinates format""" + +import os +from optparse import OptionParser +from commons.core.parsing.PslParser import PslParser +from commons.core.parsing.AxtParser import AxtParser +from commons.core.writer.Gff3Writer import Gff3Writer +from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter +from SMART.Java.Python.structure.TranscriptListsComparator import TranscriptListsComparator +from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer +from SMART.Java.Python.misc.Progress import Progress + + +class MappingToCoordinates(object): + def __init__(self,verbosity=1, inputFileName=None, format = None, output=None,galaxy = False, title="S-MART"): + self.verbosity = verbosity + self.inputFileName = inputFileName + self.format = format + self.output = output + self.galaxy = galaxy + self.title = title + + def setAttributesFromCmdLine(self): + description = "Mapping To Coordinates v1.0.1: Convert a set of mappings (given by a mapping tool) to a set of transcripts. 
[Category: Conversion]" + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in mapping format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [compulsory] [format: mapping file format]") + parser.add_option("-o", "--output", dest="output", action="store", default=None, type="string", help="output file [compulsory] [format: output file in GFF3 format]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + parser.add_option("-G", "--galaxy", dest="galaxy", action="store_true", default=False, help="used for galaxy [format: bool] [default: False]") + (options, args) = parser.parse_args() + + self.verbosity = options.verbosity + self.inputFileName = options.inputFileName + self.format = options.format + self.output = options.output + self.galaxy = options.galaxy + + def run(self): + if self.verbosity > 0: + print "Reading input file..." + parser = TranscriptContainer(self.inputFileName, self.format, self.verbosity) + if self.verbosity > 0: + print "... done" + writer = Gff3Writer(self.output, self.verbosity, self.title) + + progress = Progress(parser.getNbTranscripts(), "Reading %s" % (self.inputFileName), self.verbosity) + for transcript in parser.getIterator(): + writer.addTranscript(transcript) + progress.inc() + progress.done() + + if self.galaxy: + os.rename("%s.gff3" % (self.output), self.output) + +if __name__ == '__main__': + launcher = MappingToCoordinates() + launcher.setAttributesFromCmdLine() + launcher.run() diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/mergeSlidingWindowsClusters.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/mergeSlidingWindowsClusters.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,144 @@ +#! 
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
"""
Merge sliding windows of two different clusterings
"""

import sys
import re
import os
from optparse import OptionParser
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from commons.core.writer.Gff3Writer import Gff3Writer
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.structure.Transcript import Transcript

class MergeSlidingWindowsClusters(object):
    """
    Merge the output of several sets of sliding windows.

    Windows are keyed by (chromosome, direction, start, end); a window found
    in several inputs is written once, with the tag dictionaries merged.
    """

    def __init__(self, verbosity = 0):
        """
        @param verbosity: trace level handed to the containers, writer and progress bars
        @type  verbosity: int
        """
        self.verbosity     = verbosity
        self.inputs        = []
        # chromosome -> direction -> start -> end -> tags, for the chromosome being processed
        self.outputData    = {}
        self.nbData        = 0
        self.nbWrittenData = 0
        self.chromosomes   = []
        self.writer        = None

    def __del__(self):
        self._closeWriter()

    def _closeWriter(self):
        """Close the writer once; later calls (e.g. from __del__ after merge()) are no-ops."""
        if self.writer is not None:
            self.writer.close()
            self.writer = None

    def addInput(self, fileName, fileFormat):
        """
        Register an input file and add its chromosomes to the list to process.
        @param fileName:   name of the input file
        @type  fileName:   string
        @param fileFormat: format of the input file
        @type  fileFormat: string
        """
        container = TranscriptContainer(fileName, fileFormat, self.verbosity)
        self.inputs.append(container)
        self.chromosomes = list(set(self.chromosomes).union(set(container.getChromosomes())))

    def setOutput(self, fileName):
        """
        Open the GFF3 writer on the given file.
        @param fileName: name of the output file
        @type  fileName: string
        """
        self.writer = Gff3Writer(fileName, self.verbosity)

    def readInput(self, i, chromosome):
        """
        Load into self.outputData the windows of input #i located on the given chromosome.
        Exits the program if a window starts where an already stored window
        (same chromosome and direction) starts, but ends somewhere else.
        @param i:          index of the input in self.inputs
        @type  i:          int
        @param chromosome: name of the chromosome to read
        @type  chromosome: string
        """
        container = self.inputs[i]
        progress  = Progress(container.getNbTranscripts(), "Reading file #%d -- chromosome %s" % (i+1, chromosome), self.verbosity)
        for transcript in container.getIterator():
            progress.inc()
            if chromosome != transcript.getChromosome():
                continue
            start     = transcript.getStart()
            end       = transcript.getEnd()
            direction = transcript.getDirection()
            tags      = transcript.tags
            windows   = self.outputData.setdefault(chromosome, {}).setdefault(direction, {}).setdefault(start, {})
            if end in windows:
                # same window already seen in another input: merge the tags
                windows[end].update(tags)
                continue
            if windows:
                # The original test was nested in the "end in ..." branch, so the
                # first mismatching end was silently stored instead of reported.
                otherEnd = next(iter(windows))
                sys.exit("Error! Two regions starting at %d end are not consistent (%d and %d) in %s on strand %d" % (start, end, otherEnd, chromosome, direction))
            windows[end] = tags
            self.nbData += 1
        progress.done()


    def writeOutput(self, chromosome):
        """
        Write every window stored for the chromosome, then empty the storage.
        Each window is named "region_<n>"; tags starting with "Name_" or "ID_" are dropped.
        @param chromosome: name of the chromosome to write
        @type  chromosome: string
        """
        progress = Progress(self.nbData - self.nbWrittenData, "Writing output for chromosome %s" % (chromosome), self.verbosity)
        # .get(): nothing is stored when no window was read for this chromosome
        directions = self.outputData.get(chromosome, {})
        for direction in directions:
            for start in directions[direction]:
                for end in directions[direction][start]:
                    transcript = Transcript()
                    transcript.setChromosome(chromosome)
                    transcript.setStart(start)
                    transcript.setEnd(end)
                    transcript.setDirection(direction)
                    transcript.tags = directions[direction][start][end]
                    transcript.setName("region_%d" % (self.nbWrittenData + 1))
                    # copy the names first: the tag dictionary is modified in the loop
                    for tag in list(transcript.getTagNames()):
                        if tag.startswith("Name_") or tag.startswith("ID_"):
                            del transcript.tags[tag]
                    self.nbWrittenData += 1
                    self.writer.addTranscript(transcript)
                    progress.inc()
        self.writer.write()
        progress.done()
        self.outputData = {}

    def merge(self):
        """
        Process the chromosomes one after the other: read each input for the
        chromosome, write the merged windows, and finally close the writer.
        """
        for chromosome in self.chromosomes:
            for i in range(len(self.inputs)):
                self.readInput(i, chromosome)
            self.writeOutput(chromosome)
        self._closeWriter()


if __name__ == "__main__":

    # parse command line
    description = "Merge Sliding Windows Clusters v1.0.2: Merge two files containing the results of a sliding windows clustering. [Category: Sliding Windows]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--inputFormat1", dest="inputFormat1", action="store", type="string", help="format of the input file 1 [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--inputFormat2", dest="inputFormat2", action="store", type="string", help="format of the input file 2 [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # the options marked [compulsory] have no default: report a usage error early
    for flag, value in (("-i", options.inputFileName1), ("-f", options.inputFormat1), ("-j", options.inputFileName2), ("-g", options.inputFormat2), ("-o", options.outputFileName)):
        if value is None:
            parser.error("option %s is compulsory" % (flag))

    merger = MergeSlidingWindowsClusters(options.verbosity)
    merger.addInput(options.inputFileName1, options.inputFormat1)
    merger.addInput(options.inputFileName2, options.inputFormat2)
    merger.setOutput(options.outputFileName)
    merger.merge()
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/mergeTranscriptLists.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/mergeTranscriptLists.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,174 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
"""Merge elements of two transcript lists with some condition"""

import os, random, shutil, glob
from optparse import OptionParser
from commons.core.parsing.SequenceListParser import SequenceListParser
from commons.core.parsing.BedParser import BedParser
from commons.core.parsing.GffParser import GffParser
from commons.core.writer.TranscriptWriter import TranscriptWriter
from SMART.Java.Python.structure.TranscriptListsComparator import TranscriptListsComparator
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from SMART.Java.Python.misc.RPlotter import RPlotter
from SMART.Java.Python.misc.Progress import Progress



class MergeLists(object):
    """
    Merge two lists of transcripts through a chain of TranscriptListsComparator
    runs; intermediate results go to "tmp_<root>_<seed>.gff3" files.
    """

    def __init__(self, verbosity):
        """
        @param verbosity: trace level handed to the containers, comparators and writers
        @type  verbosity: int
        """
        self.verbosity      = verbosity
        self.seed           = random.randint(0, 100000)
        self.aggregation    = False
        self.normalization  = False
        self.distance       = False
        self.antisense      = False
        self.colinear       = False
        self.fileNames      = {}
        self.formats        = {}
        self.tmpFileNames   = []
        self.logHandle      = None
        # set by setOutputFileName(); defined here so run() never hits a missing attribute
        self.outputFileName = None

#    def __del__(self):
#        for fileNameRoot in self.tmpFileNames:
#            for fileName in glob.glob("%s*" % (fileNameRoot)):
#                os.remove(fileName)
#        if self.logHandle != None:
#            self.logHandle.close()
#            self.logHandle = None

    def setLogFileName(self, fileName):
        """Open (and truncate) the log file handed to the comparators."""
        self.logHandle = open(fileName, "w")

    def setInputFileName(self, fileName, format, id):
        """
        Register an input file.
        @param fileName: name of the file
        @param format:   format of the file
        @param id:       slot of the file; run() reads slots 0 and 1
        """
        self.fileNames[id] = fileName
        self.formats[id]   = format

    def setOutputFileName(self, fileName):
        self.outputFileName = fileName

    def setAggregate(self, aggregation):
        self.aggregation = aggregation

    def setNormalization(self, normalization):
        self.normalization = normalization

    def setDistance(self, distance):
        self.distance = distance

    def setAntisense(self, antisense):
        self.antisense = antisense

    def setColinear(self, colinear):
        self.colinear = colinear

    def createTmpFileName(self, root):
        """
        Build the name of a temporary GFF3 file and remember it in self.tmpFileNames.
        @param root: tag inserted in the file name
        @type  root: string
        @return:     "tmp_<root>_<seed>.gff3"
        """
        fileName = "tmp_%s_%d.gff3" % (root, self.seed)
        self.tmpFileNames.append(fileName)
        return fileName

    def selfMerge(self, fileName, format, outputFileName):
        """
        Run compareTranscriptListSelfMerge() on one file, colinear only,
        and write the result in GFF3.
        """
        transcriptListComparator = TranscriptListsComparator(self.logHandle, self.verbosity)
        transcriptListComparator.getColinearOnly(True)
        transcriptListComparator.setNormalization(self.normalization)
        transcriptContainer = TranscriptContainer(fileName, format, self.verbosity)
        writer = TranscriptWriter(outputFileName, "gff3", self.verbosity)
        transcriptListComparator.setInputTranscriptContainer(transcriptListComparator.QUERY, transcriptContainer)
        transcriptListComparator.setOutputWriter(writer)
        transcriptListComparator.compareTranscriptListSelfMerge()

    def keepOverlapping(self, fileNames, formats, outputFileName):
        """
        Run compareTranscriptList() on two files with the antisense, colinear,
        aggregation, normalization and distance settings of this object, and
        write the result in GFF3.
        @param fileNames: names of the two files, indexed by 0 and 1
        @param formats:   formats of the two files, indexed by 0 and 1
        """
        transcriptListComparator = TranscriptListsComparator(self.logHandle, self.verbosity)
        transcriptListComparator.getAntisenseOnly(self.antisense)
        transcriptListComparator.getColinearOnly(self.colinear)
        for i in (0, 1):
            transcriptContainer = TranscriptContainer(fileNames[i], formats[i], self.verbosity)
            transcriptListComparator.setInputTranscriptContainer(i, transcriptContainer)
        transcriptListComparator.aggregate(self.aggregation)
        transcriptListComparator.setNormalization(self.normalization)
        transcriptListComparator.setMaxDistance(self.distance)
        writer = TranscriptWriter(outputFileName, "gff3", self.verbosity)
        transcriptListComparator.setOutputWriter(writer)
        transcriptListComparator.compareTranscriptList()

    def mergeFiles(self, fileName1, fileName2, outputFileName):
        """Concatenate fileName1 then fileName2 into outputFileName; every handle is closed on exit."""
        with open(outputFileName, "w") as outputFile:
            for fileName in (fileName1, fileName2):
                with open(fileName, "r") as inputFile:
                    shutil.copyfileobj(inputFile, outputFile)

    def run(self):
        """
        Chain the comparisons: input 0 against itself, self-merge of input 1,
        overlap of both results (skipped when aggregation is set), concatenation,
        and a last self-merge written to self.outputFileName.
        """
        selectedFileQuery = self.createTmpFileName("query")
        # input 0 is read in the format given by the user (it was hard-coded to "gff3")
        self.keepOverlapping({0: self.fileNames[0], 1: self.fileNames[0]}, {0: self.formats[0], 1: self.formats[0]}, selectedFileQuery)
        mergeFileTarget = self.createTmpFileName("target")
        self.selfMerge(self.fileNames[1], self.formats[1], mergeFileTarget)
        if not self.aggregation:
            overlapFile = self.createTmpFileName("overlap")
            self.keepOverlapping({0: mergeFileTarget, 1: selectedFileQuery}, {0: "gff3", 1: "gff3"}, overlapFile)
            mergeFileTarget = overlapFile
        mergeFileMerged = self.createTmpFileName("merged")
        self.mergeFiles(mergeFileTarget, selectedFileQuery, mergeFileMerged)
        self.selfMerge(mergeFileMerged, "gff3", self.outputFileName)



if __name__ == "__main__":

    # parse command line
    description = "Merge Lists v1.0.3: Merge the elements of two lists of genomic coordinates. [Category: Merge]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", default=None, type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--format2", dest="format2", action="store", default=None, type="string", help="format of file 2 [compulsory] [format: file in transcript format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-k", "--all", dest="all", action="store_true", default=False, help="print all the transcripts, not only those overlapping [format: bool] [default: false]")
    parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="max. distance between two transcripts [format: int] [default: 0]")
    parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="antisense only [format: bool] [default: false]")
    parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="colinear only [format: bool] [default: false]")
    parser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # the options marked [compulsory] have no usable default: report a usage error early
    for flag, value in (("-i", options.inputFileName1), ("-f", options.format1), ("-j", options.inputFileName2), ("-g", options.format2), ("-o", options.outputFileName)):
        if value is None:
            parser.error("option %s is compulsory" % (flag))

#    ml = MergeLists(logHandle, options.verbosity)

    # honour -v (the level was hard-coded to 0, which made the option a no-op)
    ml = MergeLists(options.verbosity)
    ml.setInputFileName(options.inputFileName1, options.format1, 0)
    ml.setInputFileName(options.inputFileName2, options.format2, 1)
    ml.setOutputFileName(options.outputFileName)
    ml.setAntisense(options.antisense)
    ml.setColinear(options.colinear)
    ml.setAggregate(options.all)
    ml.setNormalization(options.normalize)
    ml.setDistance(options.distance)
    ml.run()
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/misc/MultipleRPlotter.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/misc/MultipleRPlotter.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,160 @@
#
# Copyright INRA-URGI 2009-2012
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#

import os
import subprocess
import random
import math
from SMART.Java.Python.misc.RPlotter import RPlotter

NBCOLORS = 9

"""
Plot multiple curves with RPlotter
"""

class MultipleRPlotter(object):
    """
    Plot some curves
    @ivar fileName: name of the file
    @type fileName: string
    @ivar height: height of the file
    @type height: int
    @ivar width: width of the file
    @type width: int
    @ivar plots: plots to be included
    @type plots: list of L{RPlotter{RPlotter}}
    @ivar keep: keep script lines
    @type keep: boolean
    @ivar format: format of the file
    @type format: string
    """

    def __init__(self, fileName, verbosity = 0, keep = False):
        """
        Constructor
        @param fileName: name of the file to produce
        @type  fileName: string
        @param verbosity: verbosity
        @type  verbosity: int
        @param keep: keep temporary files
        @type  keep: boolean
        """
        self.fileName = fileName
        self.verbosity = verbosity
        self.keep = keep
        self.format = "png"
        self.width = 1000
        self.height = 500
        self.plots = []
        self.scriptFileName = "tmpScript-%d.R" % (os.getpid())

    def __del__(self):
        """
        Destructor
        Remove the script file and its "<script>out" companion, unless keep is set
        """
        if not self.keep:
            if os.path.exists(self.scriptFileName):
                os.remove(self.scriptFileName)
            outputFileName = "%sout" % (self.scriptFileName)
            if os.path.exists(outputFileName):
                os.remove(outputFileName)

    def setFormat(self, format):
        """
        Set the format of the picture
        @param format: the format ("png", "pdf", "jpeg", "bmp" or "tiff")
        @type  format: string
        @raise Exception: if the format is not one of the above
        """
        if format not in ("png", "pdf", "jpeg", "bmp", "tiff"):
            raise Exception("Format '%s' is not supported by RPlotter" % (format))
        self.format = format


    def setWidth(self, width):
        """
        Set the width of the image produced
        @param width: width of the image
        @type  width: int
        """
        self.width = width


    def setHeight(self, height):
        """
        Set the height of the image produced
        @param height: height of the image
        @type  height: int
        """
        self.height = height


    def setImageSize(self, width, height):
        """
        Set the dimensions of the image produced
        @param width: width of the image
        @type  width: int
        @param height: height of the image
        @type  height: int
        """
        self.width = width
        self.height = height

    def addPlot(self, plot):
        """
        Add a plot
        @param plot: plot to be included; plot() calls its getScript() method
        @type  plot: L{RPlotter{RPlotter}}
        """
        self.plots.append(plot)

    def plot(self):
        """
        Plot the figures: write the R script (one row per added plot) and run it
        with "R CMD BATCH". The R executable is read from the SMARTRPATH
        environment variable when it is set.
        @raise Exception: if R returns a non-zero status; the script is then kept
        """
        deviceArgument = "filename" if self.format != "pdf" else "file"
        # "with" closes the script even if one of the getScript() calls raises
        with open(self.scriptFileName, "w") as scriptHandle:
            scriptHandle.write("library(RColorBrewer)\n")
            scriptHandle.write("colorPanel = brewer.pal(n=%d, name=\"Set1\")\n" % (NBCOLORS))
            scriptHandle.write("%s(%s = \"%s\", width = %d, height = %d, bg = \"white\")\n" % (self.format, deviceArgument, self.fileName, self.width, self.height))
            scriptHandle.write("par(mfrow=c(%d, 1))\n" % (len(self.plots)))
            for plot in self.plots:
                scriptHandle.write(plot.getScript())
            scriptHandle.write("dev.off()\n")
        rCommand = os.environ.get("SMARTRPATH", "R")
        # argument list, no shell: the executable path and the script name are passed
        # verbatim, whatever characters they contain
        status = subprocess.call([rCommand, "CMD", "BATCH", self.scriptFileName])
        if status != 0:
            self.keep = True
            raise Exception("Problem with the execution of script file %s, status is: %s" % (self.scriptFileName, status))

# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/misc/Progress.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/misc/Progress.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,93 @@
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import sys
import time

class Progress(object):
    """Show the progress of a process"""

    def __init__(self, aim, message = "Progress", verbosity = 0):
        """
        @param aim:       number of steps expected
        @type  aim:       int
        @param message:   text printed before the bar (truncated to 50 characters)
        @type  message:   string
        @param verbosity: nothing is printed when <= 0
        @type  verbosity: int
        """
        self.aim            = aim
        self.progress       = 0
        self.message        = message
        self.length         = -1
        self.verbosity      = verbosity
        self.maxMessageSize = 50
        self.barSize        = 80
        self.startTime      = time.time()
        self.elapsed        = 0
        if len(self.message) > self.maxMessageSize:
            self.message = self.message[0:self.maxMessageSize-3] + "..."
        self.show()


    def inc(self):
        """Record one more step and refresh the bar."""
        self.progress += 1
        self.show()


    def getPrintableElapsedTime(self, time):
        """
        Format a duration with its two most significant units.
        @param time: duration in seconds
        @type  time: number
        @return:     "<h>h <m>m", "<m>m <s>s" or "<s>s "
        @rtype:      string
        """
        seconds = int(time)
        # floor division: "/" would produce floats on Python 3
        timeHou = seconds // 3600
        timeMin = seconds // 60 - 60 * timeHou
        timeSec = seconds % 60
        if timeHou > 0:
            return "%3dh %2dm" % (timeHou, timeMin)
        if timeMin > 0:
            return "%2dm %2ds" % (timeMin, timeSec)
        return "%2ds " % (timeSec)


    def show(self):
        """
        Redraw the bar on standard output when it grew by at least one character,
        or when more than 10 seconds passed since the last redraw.
        """
        if self.verbosity <= 0:
            return
        if self.aim == 0:
            return
        messageSize = len(self.message)
        length      = int(self.progress / float(self.aim) * self.barSize)
        elapsed     = int(time.time() - self.startTime)
        if (length > self.length) or (elapsed > self.elapsed + 10):
            self.length  = length
            self.elapsed = elapsed
            string = "%s%s[%s%s] %d/%d" % (self.message, " " * max(0, self.maxMessageSize - messageSize), "=" * self.length, " " * (self.barSize - self.length), self.progress, self.aim)
            # no ETA before the first step: the estimate divides by the progress
            if elapsed > 5 and self.progress > 0:
                done      = float(self.progress) / self.aim
                total     = elapsed / done
                remaining = total - elapsed
                string += " ETA: %s " % (self.getPrintableElapsedTime(remaining))
            string += "\r"
            sys.stdout.write(string)
            sys.stdout.flush()


    def done(self):
        """Print the completed bar and the total elapsed time, followed by a newline."""
        if self.verbosity > 0:
            messageSize = len(self.message)
            elapsed     = time.time() - self.startTime
            # sys.stdout.write works on Python 2 and 3, unlike the print statement
            sys.stdout.write("%s%s[%s] %d completed in %s \n" % (self.message, " " * max(0, self.maxMessageSize - messageSize), "=" * self.barSize, self.aim, self.getPrintableElapsedTime(elapsed)))
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/misc/Progress.pyc
# Binary file smart_toolShed/SMART/Java/Python/misc/Progress.pyc has changed
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/misc/RPlotter.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/misc/RPlotter.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,820 @@
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# + +import os +import subprocess +import random +import math + +minPositiveValue = 10e-6 + +""" +Plot simple curves in R +""" + +class RPlotter(object): + """ + Plot some curves + @ivar nbColors: number of different colors + @type nbColors: int + @ivar fileName: name of the file + @type fileName: string + @ivar lines: lines to be plotted + @type lines: array of dict + @ivar names: name of the lines + @type names: array of strings + @ivar colors: color of the lines + @type colors: array of strings + @ivar types: type of the lines (plain or dashed) + @type types: array of strings + @ivar format: format of the picture + @type format: string + @ivar lineWidth: width of the line in a xy-plot + @type lineWidth: int + @ivar xMin: minimum value taken on the x-axis + @type xMin: int + @ivar xMax: maximum value taken on the x-axis + @type xMax: int + @ivar yMin: minimum value taken on the y-axis + @type yMin: int + @ivar yMax: maximum value taken on the y-axis + @type yMax: int + @ivar minimumX: minimum value allowed on the x-axis + @type minimumX: int + @ivar maximumX: maximum value allowed on the x-axis + @type maximumX: int + @ivar minimumY: minimum value allowed on the y-axis + @type minimumY: int + @ivar maximumY: maximum value allowed on the y-axis + @type maximumY: int + @ivar leftMargin: add some margin in the left part of the plot + @type leftMargin: float + @ivar rightMargin: add some margin in the right part of the plot + @type rightMargin: float + @ivar downMargin: add some margin at the top of the plot + @type downMargin: float + @ivar upMargin: add some margin at the bottom of the plot + @type upMargin: float + @ivar logX: use log scale on the x-axis + @type logX: boolean + @ivar logY: use log scale on the y-axis + @type logY: boolean + @ivar logZ: use log scale on the z-axis (the color) + @type logZ: boolean + @ival fill: if a value is not given, fill it with given value + @type fill: int + @ival bucket: cluster the data into buckets of given size + @type 
bucket: int + @ival seed: a random number + @type seed: int + @ival regression: plot a linear regression + @type regression: boolean + @ival legend: set the legend + @type legend: boolean + @ival legendBySide: set the legend outside of the plot + @type legendBySde: boolean + @ival xLabel: label for the x-axis + @type xLabel: string + @ival yLabel: label for the y-axis + @type yLabel: string + @ival title: title of the plot + @type title: string + @ival barplot: use a barplot representation instead + @type barplot: boolean + @ival points: use a point cloud instead + @type points: boolean + @ival heatPoints: use a colored point cloud instead + @type heatPoints: boolean + @ival axesLabels: change the names of the axes + @type axesLabels: vector of 2 int to string dict + @ival rotateAxesLabels: rotate the axes labels + @type rotateAxesLabels: dict of 2 boolean + @ival verbosity: verbosity of the class + @type verbosity: int + @ival keep: keep temporary files + @type keep: boolean + """ + + def __init__(self, fileName, verbosity = 0, keep = False): + """ + Constructor + @param fileName: name of the file to produce + @type fileName: string + @param verbosity: verbosity + @type verbosity: int + @param keep: keep temporary files + @type keep: boolean + """ + self.nbColors = 9 + self.fileName = fileName + self.verbosity = verbosity + self.keep = keep + self.format = "png" + self.fill = None + self.bucket = None + self.lines = [] + self.names = [] + self.colors = [] + self.types = [] + self.lineWidth = 1 + self.xMin = None + self.xMax = None + self.yMin = None + self.yMax = None + self.seed = random.randint(0, 10000) + self.minimumX = None + self.maximumX = None + self.minimumY = None + self.maximumY = None + self.leftMargin = 0 + self.rightMargin = 0 + self.topMargin = 0 + self.bottomMargin = 0 + self.logX = False + self.logY = False + self.logZ = False + self.regression = False + self.width = 1000 + self.height = 500 + self.legend = False + self.legendBySide = False + 
self.xLabel = "" + self.yLabel = "" + self.title = None + self.points = False + self.heatPoints = False + self.barplot = False + self.axesLabels = {1: None, 2: None} + self.rotateAxesLabels = {1: False, 2: False} + self.linesToAddBox = "" + + def __del__(self): + """ + Destructor + Remove tmp files + """ + if not self.keep: + scriptFileName = "tmpScript-%d.R" % (self.seed) + if os.path.exists(scriptFileName): + os.remove(scriptFileName) + outputFileName = "%sout" % (scriptFileName) + if os.path.exists(outputFileName): + os.remove(outputFileName) + nbLines = len(self.lines) + (1 if self.heatPoints else 0) + for i in range(nbLines): + if os.path.exists("tmpData-%d-%d.dat" % (self.seed, i)): + os.remove("tmpData-%d-%d.dat" % (self.seed, i)) + + + def setMinimumX(self, xMin): + """ + Set the minimum value on the x-axis + @param xMin:minimum value on the x-axis + @type xMin: int + """ + self.minimumX = xMin + + + def setMaximumX(self, xMax): + """ + Set the maximum value on the x-axis + @param xMax: maximum value on the x-axis + @type xMax: int + """ + self.maximumX = xMax + + + def setMinimumY(self, yMin): + """ + Set the minimum value on the y-axis + @param yMin: minimum value on the y-axis + @type yMin: int + """ + self.minimumY = yMin + + + def setMaximumY(self, yMax): + """ + Set the maximum value on the y-axis + @param yMax: maximum value on the y-axis + @type xmax: int + """ + self.maximumY = yMax + + + def setFill(self, fill): + """ + Fill empty data with given value + @param fill: the value to fill with + @type fill: int + """ + self.fill = fill + + + def setBuckets(self, bucket): + """ + Cluster the data into buckets of given size + @param bucket: the size of the buckets + @type bucket: int + """ + self.bucket = bucket + + + def setRegression(self, regression): + """ + Plot a linear regression line + @param regression: whether to plot the regression + @type regression: bool + """ + self.regression = regression + + + def setFormat(self, format): + """ + Set the 
format of the picture + @param format: the format + @type format: string + """ + if format not in ("png", "pdf", "jpeg", "bmp", "tiff"): + raise Exception("Format '%s' is not supported by RPlotter" % (format)) + self.format = format + + + def setWidth(self, width): + """ + Set the dimensions of the image produced + @param width: width of the image + @type width: int + """ + self.width = width + + + def setHeight(self, height): + """ + Set the dimensions of the image produced + @param height: heigth of the image + @type height: int + """ + self.height = height + + + def setImageSize(self, width, height): + """ + Set the dimensions of the image produced + @param width: width of the image + @type width: int + @param height: heigth of the image + @type height: int + """ + self.setWidth(width) + self.setHeight(height) + + + def setLegend(self, legend, bySide = False): + """ + Print a legend or not + @param legend: print a legend + @type legend: boolean + @param bySide: put the legend outside of the plot + @type bySide: boolean + """ + self.legend = legend + self.legendBySide = bySide + + + def setXLabel(self, label): + """ + Print a label for the x-axis + @param label: the label + @type label: string + """ + self.xLabel = label + if self.xLabel != None: + self.xLabel = self.xLabel.replace("_", " ") + + + def setYLabel(self, label): + """ + Print a label for the y-axis + @param label: the label + @type label: string + """ + self.yLabel = label + if self.yLabel != None: + self.yLabel = self.yLabel.replace("_", " ") + + + def addLeftMargin(self, margin): + """ + Increase the size of the space on the left part of the graph + @param margin: the space added + @type margin: float + """ + self.leftMargin = margin + + + def addRightMargin(self, margin): + """ + Increase the size of the space on the right part of the graph + @param margin: the space added + @type margin: float + """ + self.rightMargin = margin + + + def addTopMargin(self, margin): + """ + Increase the size of the 
space at the top of the graph + TopMargin is a percentage if 0 < TopMargin < 1. + TopMargin is a value if TopMargin >= 1. + @param margin: the space added + @type margin: float + """ + self.topMargin = margin + + + def addBottomMargin(self, margin): + """ + Increase the size of the space at the bottom of the graph + @param margin: the space added + @type margin: float + """ + self.bottomMargin = margin + + + def getNewYMaxWithTopMargin(self): + """ + Return new xMin coordinate with left margin + @param xMin: coordinate + @type xMin: float + """ + yMax = self.yMax + if 0 < self.topMargin and self.topMargin < 1: + topMargin = self.topMargin * self.yMax + yMax = self.yMax + topMargin + elif self.topMargin >= 1: + yMax = self.yMax + self.topMargin + return yMax + + + def setTitle(self, title): + """ + Print a title for graph + @param title: a title + @type title: string + """ + self.title = title + if self.title != None: + self.title = self.title.replace("_", " ") + + + def setAxisLabel(self, i, labels): + """ + Change x- or y-labels + @param i: x for x-label, y for y-label + @type i: string + @param labels: new labels + @type labels: int to string dict + """ + i = i.lower() + if i not in ("x", "y"): + raise Exception("Label name '" + i + "' should by 'x' or 'y' while changing axis labels.") + self.axesLabels[{"x": 1, "y": 2}[i]] = labels + + + def rotateAxisLabel(self, i, b = True): + """ + Rotate x- or y-labels + @param i: x for x-label, y for y-label + @type i: string + @param b: whether the labels should be rotated + @type b: boolean + """ + i = i.lower() + if i not in ("x", "y"): + raise Exception("Label name '" + i + "' should by 'x' or 'y' while rotating axis labels.") + self.rotateAxesLabels[{"x": 1, "y": 2}[i]] = b + + def setLineWidth(self, width): + """ + Set the line width in a xy-plot + @param width: the new line width + @type width: int + """ + self.lineWidth = width + + def setLog(self, log): + """ + Use log-scale for axes + @param log: use log scale + 
def addBox(self, lXCoordList, minY, maxY):
    """
    Append one R "rect" command per x-interval to the commands kept in linesToAddBox
    @param lXCoordList: the x-intervals (element 0 is the first x, element 1 the second x)
    @type  lXCoordList: list of pairs
    @param minY: the first y coordinate given to every rectangle
    @type  minY: number
    @param maxY: the second y coordinate given to every rectangle
    @type  maxY: number
    """
    template = "rect(%s,%s,%s,%s,density=50, col='grey',border='transparent')\n"
    rectangles = [template % (bounds[0], minY, bounds[1], maxY) for bounds in lXCoordList]
    self.linesToAddBox += "".join(rectangles)
def addHeatLine(self, line, name = "", color = None):
    """
    Add the heat line: write one R color string per element of the line
    to the temporary data file "tmpData-<seed>-<number of lines>.dat".
    The smallest value is written as "#00FF00" and the color moves towards
    "#FF0000" as the value grows (log10 scale when logZ is set).
    @param line:  the line which gives the color of the points
    @type  line:  dict
    @param name:  the name added to the legend names
    @type  name:  string
    @param color: the color registered for this line (a default one is chosen if None)
    @type  color: string
    @raise Exception: if the heat points mode has not been set
    """
    if not self.heatPoints:
        raise Exception("Error! Trying to add a heat point whereas not mentioned to earlier! Aborting.")

    minLogValue = 0.00001
    log         = self.logZ
    # computed before the file is opened: an empty line fails here without leaving an opened file behind
    minimumHeat = min(line.values())
    maximumHeat = max(line.values())

    # in log scale, a minimum of 0 is avoided by shifting every value;
    # the shift is kept local so that the dict of the caller is not modified
    shift = 0
    if log:
        if minimumHeat == 0:
            shift = minLogValue
        minimumHeat = math.log10(minimumHeat + shift)
        maximumHeat = math.log10(maximumHeat + shift)

    # when all the values are equal, there is no gradient: every point gets the color of the minimum
    heatRange = maximumHeat - minimumHeat
    coeff     = 255.0 / heatRange if heatRange != 0 else 0.0

    dataFileName = "tmpData-%d-%d.dat" % (self.seed, len(self.lines))
    with open(dataFileName, "w") as dataHandle:
        for element in line:
            value = line[element] + shift
            if log:
                value = math.log10(max(minLogValue, value))
            level = int((value - minimumHeat) * coeff)
            dataHandle.write("\"#%02X%02X00\"\n" % (level, 255 - level))

    self.names.append(name)
    if color is None:
        # default colors are taken from the color panel; dashed lines are used once the panel is exhausted
        colorNumber = len(self.colors) % (self.nbColors - 1) + 1
        lineType    = "solid"
        if len(self.colors) >= self.nbColors:
            lineType = "dashed"
        color = "colorPanel[%d]" % (colorNumber)
    else:
        color    = "\"%s\"" % (color)
        lineType = "solid"
    self.colors.append(color)
    self.types.append(lineType)
self.barplot: + script += "data = scan(\"tmpData-%d-0.dat\", list(x = -666, y = -666))\n" % (self.seed) + if len(self.lines) == 1: + script += "barplot(data$y, name = data$x, xlab=\"%s\", ylab=\"%s\", ylim = c(%f, %f), cex.axis = 2, cex.names = 2, cex.lab = 2%s%s)\n" % (self.xLabel, self.yLabel, yMin, yMax, title, log) + addAxes = False + else: + script += "data1 = scan(\"tmpData-%d-1.dat\", list(x = -666, y = -666))\n" % (self.seed) + script += "barplot(rbind(data$y, data1$y), name = data$x, xlab=\"%s\", ylab=\"%s\", cex.axis = 2, cex.names = 2, cex.lab = 2%s, beside = TRUE, space=c(-1,0), axes = FALSE%s)\n" % (self.xLabel, self.yLabel, title, log) + elif self.points: + script += "data = scan(\"tmpData-%d-0.dat\", list(x = -666, y = -666))\n" % (self.seed) + script += "plot(data$x, data$y, xlab=\"%s\", ylab=\"%s\", cex.axis = 2, cex.lab = 2, axes = FALSE%s%s)\n" % (self.xLabel, self.yLabel, title, log) + if self.regression: + x = "log10(data$x)" if self.logX else "data$x" + y = "log10(data$y)" if self.logY else "data$y" + script += "abline(lm(%s ~ %s))\n" % (y, x) + elif self.heatPoints: + if len(self.lines) != 1: + raise Exception("Error! Bad number of input data! 
Aborting...") + script += "data = scan(\"tmpData-%d-0.dat\", list(x = -666, y = -666))\n" % (self.seed) + script += "heatData = scan(\"tmpData-%d-1.dat\", list(x = \"\"))\n" % (self.seed) + script += "plot(data$x, data$y, col=heatData$x, xlab=\"%s\", ylab=\"%s\", cex.axis = 2, cex.lab = 2, axes = FALSE%s%s)\n" % (self.xLabel, self.yLabel, title, log) + if self.regression: + x = "log10(data$x)" if self.logX else "data$x" + y = "log10(data$y)" if self.logY else "data$y" + script += "abline(lm(%s ~ %s))\n" % (y, x) + else: + script += "plot(x = NA, y = NA, panel.first = grid(lwd = 1.0), xlab=\"%s\", ylab=\"%s\", xlim = c(%f, %f), ylim = c(%f, %f), cex.axis = 2, cex.lab = 2, axes = FALSE%s%s)\n" % (self.xLabel, self.yLabel, xMin, xMax, yMin, yMax, title, log) + for i in range(0, len(self.lines)): + script += "data = scan(\"tmpData-%d-%d.dat\", list(x = -666.666, y = -666.666))\n" % (self.seed, i) + script += "lines(x = data$x, y = data$y, col = %s, lty = \"%s\", lwd = %d)\n" % (self.colors[i], self.types[i], self.lineWidth) + + script += self.linesToAddBox + + if addAxes: + for i in self.axesLabels: + rotation = ", las = 2" if self.rotateAxesLabels[i] else "" + if self.axesLabels[i] == None: + script += "axis(%d, cex.axis = 2, cex.lab = 2%s)\n" % (i, rotation) + else: + oldKeys = ", ".join(["%d" % (key) for key in sorted(self.axesLabels[i].keys())]) + newKeys = ", ".join(["\"%s\"" % (self.axesLabels[i][key]) for key in sorted(self.axesLabels[i].keys())]) + script += "axis(%d, at=c(%s), lab=c(%s), cex.axis = 2, cex.lab = 2%s)\n" % (i, oldKeys, newKeys, rotation) + script += "box()\n" + + if self.legend: + if self.legendBySide: + script += "plot.new()\n" + script += "par(mar=c(0,0,0,0))\n" + script += "plot.window(c(0,1), c(0,1))\n" + script += "legends = c(%s)\n" % ", ".join(["\"%s\"" % name for name in self.names]) + script += "colors = c(%s)\n" % ", ".join(["%s" % color for color in self.colors]) + script += "lineTypes = c(%s)\n" % ", ".join(["\"%s\"" % type for type 
def plot(self):
    """
    Plot the lines: write the R script "tmpScript-<seed>.R" and run it with "R CMD BATCH".
    The R executable is "R", or the value of the SMARTRPATH environment variable when it is set.
    @raise Exception: if R exits with a non-zero status (self.keep is then set to True)
    """
    scriptFileName = "tmpScript-%d.R" % (self.seed)

    # the script is fully built before the file is opened, so that a failure in getScript()
    # cannot leave an opened, half-written script behind
    script  = "library(RColorBrewer)\n"
    script += "colorPanel = brewer.pal(n=%d, name=\"Set1\")\n" % (self.nbColors)
    # the pdf device names its output "file", whereas the other devices use "filename"
    script += "%s(%s = \"%s\", width = %d, height = %d, bg = \"white\")\n" % (self.format, "filename" if self.format != "pdf" else "file", self.fileName, self.width, self.height)
    script += self.getScript()
    script += "dev.off()\n"

    with open(scriptFileName, "w") as scriptHandle:
        scriptHandle.write(script)

    rCommand = "R"
    if "SMARTRPATH" in os.environ:
        rCommand = os.environ["SMARTRPATH"]
    command = "\"%s\" CMD BATCH %s" % (rCommand, scriptFileName)
    status  = subprocess.call(command, shell=True)

    if status != 0:
        self.keep = True
        raise Exception("Problem with the execution of script file %s, status is: %s" % (scriptFileName, status))
def getSpearmanRho(self):
    """
    Get the Spearman rho correlation using R
    As written, the method returns None immediately: the statements which follow
    the first return are never executed.
    @return: None
    """
    # NOTE(review): this unconditional return disables the whole computation below;
    # presumably done on purpose (the code below requires the R package Hmisc) -- confirm before removing
    return None
    # --- unreachable code ---
    # the correlation is only computed in points, barplot or heat points mode
    if not self.points and not self.barplot and not self.heatPoints:
        raise Exception("Cannot compute Spearman rho correlation whereas not in 'points' or 'bar' mode.")

    # write an R script which reads the first data file and calls spearman() on its two columns
    scriptFileName = "tmpScript-%d.R" % (self.seed)
    rScript = open(scriptFileName, "w")
    rScript.write("library(Hmisc)\n")
    rScript.write("data = scan(\"tmpData-%d-0.dat\", list(x = -0.000000, y = -0.000000))\n" % (self.seed))
    rScript.write("spearman(data$x, data$y)\n")
    rScript.close()

    # the R executable is "R", unless the SMARTRPATH environment variable is set
    rCommand = "R"
    if "SMARTRPATH" in os.environ:
        rCommand = os.environ["SMARTRPATH"]
    command = "\"%s\" CMD BATCH %s" % (rCommand, scriptFileName)
    status = subprocess.call(command, shell=True)

    if status != 0:
        self.keep = True
        raise Exception("Problem with the execution of script file %s, status is: %s" % (scriptFileName, status))

    # the output is read from the file named like the script, followed by "out";
    # the value is the line which follows the line containing only "rho"
    outputRFile = open("%sout" % (scriptFileName))
    nextLine = False
    for line in outputRFile:
        line = line.strip()
        if nextLine:
            if line == "NA":
                return None
            return float(line)
            nextLine = False
        if line == "rho":
            nextLine = True

    return None
class UnlimitedProgress(object):
    """Show the progress of a process when no upper bound is known"""

    def __init__(self, step = 1000, message = "Progress", verbosity = 0):
        """
        @param step:      the progress is printed every 'step' increments
        @type  step:      int
        @param message:   the text printed before the counter (shortened to 50 characters)
        @type  message:   string
        @param verbosity: nothing is printed unless this value is positive
        @type  verbosity: int
        """
        self.step           = step
        self.progress       = 0
        self.message        = message
        self.verbosity      = verbosity
        self.maxMessageSize = 50
        self.startTime      = time.time()
        self.elapsed        = 0
        if len(self.message) > self.maxMessageSize:
            self.message = self.message[0:self.maxMessageSize-3] + "..."
        self.show()


    def inc(self):
        """
        Count one more element, and print the progress if needed
        """
        self.progress += 1
        self.show()


    def getPrintableElapsedTime(self, time):
        """
        Format a duration
        @param time: a duration, in seconds (truncated to an integer)
        @type  time: number
        @return:     the duration as hours and minutes, minutes and seconds, or seconds
        @rtype:      string
        """
        # floor division keeps whole hours and minutes whatever the Python version
        seconds = int(time)
        timeHou = seconds // 3600
        timeMin = seconds // 60 - 60 * timeHou
        timeSec = seconds % 60
        if timeHou > 0:
            return "%3dh %2dm" % (timeHou, timeMin)
        if timeMin > 0:
            return "%2dm %2ds" % (timeMin, timeSec)
        return "%2ds" % (timeSec)


    def show(self):
        """
        Print the progress every 'step' increments, or when more than
        10 seconds were spent since the last print
        """
        if self.verbosity <= 0:
            return
        elapsed = int(time.time() - self.startTime)
        if (self.progress % self.step == 0) or (elapsed > self.elapsed + 10):
            self.elapsed = elapsed
            string = "%s %d -- time spent: %s\r" % (self.message, self.progress, self.getPrintableElapsedTime(elapsed))
            sys.stdout.write(string)
            sys.stdout.flush()


    def done(self):
        """
        Print the final count and the time spent, followed by a new line
        """
        if self.verbosity > 0:
            elapsed = time.time() - self.startTime
            string = "%s %d -- time spent: %s\r" % (self.message, self.progress, self.getPrintableElapsedTime(elapsed))
            # same output as the "print" statement, without depending on it
            sys.stdout.write(string + "\n")
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
def writeFile(fileName, content):
    """
    Write the content of a file (an existing file is overwritten)
    @param fileName: the name of the file
    @type  fileName: string
    @param content:  the text to write
    @type  content:  string
    """
    # the handle is closed even if the write fails
    with open(fileName, "w") as handle:
        handle.write(content)
def diff(fileName1, fileName2):
    """
    Compare two files, line by line
    The first difference found is printed on the standard output.
    @param fileName1: a file name
    @type  fileName1: string
    @param fileName2: another file name
    @type  fileName2: string
    @return:          True if the files contain the same lines, False otherwise
    @rtype:           boolean
    """
    # both files are read and closed before the comparison
    with open(fileName1) as handle1:
        lines1 = handle1.readlines()
    with open(fileName2) as handle2:
        lines2 = handle2.readlines()
    if len(lines1) != len(lines2):
        sys.stdout.write("Sizes of files differ (%d != %d)\n" % (len(lines1), len(lines2)))
        return False
    for i, (line1, line2) in enumerate(zip(lines1, lines2)):
        if line1 != line2:
            sys.stdout.write("Line %d differ ('%s' != '%s')\n" % (i, line1.strip(), line2.strip()))
            return False
    return True
def fisherExactPValueBulk(list):
    """
    Compute the p-values of several Fisher exact tests with a single call to R
    A temporary R script is written in the current directory, run with "R CMD BATCH"
    ("R" being replaced by the value of the SMARTRPATH environment variable, if set),
    and removed with its output file when the p-values have been read.
    The program exits if R fails, or if the number of p-values found differs from
    the number of tables.
    @param list: the 2x2 contingency tables, each one given as 4 values (converted to int)
    @type  list: list of tuples
    @return:     the p-values; the key of a p-value is the first two values of its table
    @rtype:      dict
    """
    # nothing to compute: R is not called on an empty script
    if len(list) == 0:
        return {}

    scriptFileName = "tmpScript-%d.R" % (random.randint(0, 10000))
    with open(scriptFileName, "w") as rScript:
        for element in list:
            rScript.write("fisher.test(matrix(c(%d, %d, %d, %d), nr=2))$p.value\n" % (int(element[0]), int(element[1]), int(element[2]), int(element[3])))

    rCommand = "R"
    if "SMARTRPATH" in os.environ:
        rCommand = os.environ["SMARTRPATH"]
    command = "\"%s\" CMD BATCH %s" % (rCommand, scriptFileName)
    status  = subprocess.call(command, shell=True)

    if status != 0:
        sys.exit("Problem with the execution of script file %s, status is: %s" % (scriptFileName, status))

    outputRFileName = "%sout" % (scriptFileName)
    pValueTag       = "[1] "
    results         = {}
    cpt             = 0
    # the output file is closed before it is removed
    with open(outputRFileName) as outputRFile:
        for line in outputRFile:
            line = line.strip()
            if not line.startswith(pValueTag):
                continue
            # more p-values than tables: report it instead of failing on the index
            if cpt >= len(list):
                sys.exit("Error in the number of p-values computed by R in file '%s'!" % (scriptFileName))
            # the p-values are expected in the same order as the tables
            results[list[cpt][0:2]] = float(line.split()[-1])
            cpt += 1

    if cpt == 0:
        sys.exit("Problem with the cannot find p-value!")
    if cpt != len(list):
        sys.exit("Error in the number of p-values computed by R in file '%s'!" % (scriptFileName))

    os.remove(scriptFileName)
    os.remove(outputRFileName)

    return results
if __name__ == "__main__":

    # parse command line
    description = "Modify Sequence List v1.0.1: Extend or shrink a list of sequences. [Category: Data Modification]"

    optionParser = OptionParser(description = description)
    optionParser.add_option("-i", "--input",     dest="inputFileName",  action="store",               type="string", help="input file [compulsory] [format: file in FASTA format]")
    optionParser.add_option("-o", "--output",    dest="outputFileName", action="store", default=None, type="string", help="output file [compulsory] [format: output file in FASTA format]")
    optionParser.add_option("-s", "--start",     dest="start",          action="store", default=None, type="int",    help="keep first nucleotides [format: int]")
    optionParser.add_option("-e", "--end",       dest="end",            action="store", default=None, type="int",    help="keep last nucleotides [format: int]")
    optionParser.add_option("-v", "--verbosity", dest="verbosity",      action="store", default=1,    type="int",    help="trace level [format: int]")
    (options, args) = optionParser.parse_args()

    # the file names are compulsory: stop with a usage message rather than fail later on a missing name
    if options.inputFileName is None:
        optionParser.error("the input file (option -i) is compulsory")
    if options.outputFileName is None:
        optionParser.error("the output file (option -o) is compulsory")

    # the sequence parser has its own name, so that the option parser is not overwritten
    fastaParser = FastaParser(options.inputFileName, options.verbosity)
    writer      = FastaWriter(options.outputFileName, options.verbosity)
    progress    = Progress(fastaParser.getNbSequences(), "Reading file %s" % (options.inputFileName), options.verbosity)
    for sequence in fastaParser.getIterator():
        # the two restrictions are applied one after the other when both are given
        if options.start is not None:
            sequence.shrinkToFirstNucleotides(options.start)
        if options.end is not None:
            sequence.shrinkToLastNucleotides(options.end)
        writer.addSequence(sequence)
        progress.inc()
    progress.done()
if __name__ == "__main__":

    # parse command line
    description = "Modify Genomic Coordinates v1.0.1: Extend or shrink a list of genomic coordinates. [Category: Data Modification]"

    optionParser = OptionParser(description = description)
    optionParser.add_option("-i", "--input",      dest="inputFileName",  action="store",               type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    optionParser.add_option("-f", "--format",     dest="format",         action="store",               type="string", help="format of the input [compulsory] [format: transcript file format]")
    optionParser.add_option("-o", "--output",     dest="outputFileName", action="store",               type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    optionParser.add_option("-s", "--start",      dest="start",          action="store", default=None, type="int",    help="restrict to the start of the transcript [format: int]")
    optionParser.add_option("-e", "--end",        dest="end",            action="store", default=None, type="int",    help="restrict to the end of the transcript [format: int]")
    optionParser.add_option("-5", "--fivePrime",  dest="fivePrime",      action="store", default=None, type="int",    help="extend to the 5' direction [format: int]")
    optionParser.add_option("-3", "--threePrime", dest="threePrime",     action="store", default=None, type="int",    help="extend to the 3' direction [format: int]")
    optionParser.add_option("-v", "--verbosity",  dest="verbosity",      action="store", default=1,    type="int",    help="trace level [format: int]")

    (options, args) = optionParser.parse_args()

    # these options are compulsory: stop with a usage message rather than fail later on a missing value
    if options.inputFileName is None:
        optionParser.error("the input file (option -i) is compulsory")
    if options.format is None:
        optionParser.error("the format of the input file (option -f) is compulsory")
    if options.outputFileName is None:
        optionParser.error("the output file (option -o) is compulsory")

    # the transcript container has its own name, so that the option parser is not overwritten
    container = TranscriptContainer(options.inputFileName, options.format, options.verbosity)

    writer = TranscriptWriter(options.outputFileName, "gff3", options.verbosity)

    nbItems = container.getNbItems()
    print("%i items found" % (nbItems))

    progress = Progress(nbItems, "Analyzing sequences of " + options.inputFileName, options.verbosity)
    for transcript in container.getIterator():
        # restrictions are applied before extensions, each one only if requested
        if options.start is not None:
            transcript.restrictStart(options.start)
        if options.end is not None:
            transcript.restrictEnd(options.end)
        if options.fivePrime is not None:
            transcript.extendStart(options.fivePrime)
        if options.threePrime is not None:
            transcript.extendEnd(options.threePrime)

        writer.addTranscript(transcript)

        progress.inc()
    progress.done()

    writer.write()
"""Modify the content of a FASTA or FASTQ file: keep only the first and/or last nucleotides of each sequence"""
import sys
from optparse import OptionParser
from commons.core.parsing.FastaParser import FastaParser
from commons.core.parsing.FastqParser import FastqParser
from commons.core.writer.FastaWriter import FastaWriter
from commons.core.writer.FastqWriter import FastqWriter
from SMART.Java.Python.misc.Progress import Progress


if __name__ == "__main__":

    # parse command line
    description = "Modify Sequence List v1.0.1: Extend or shrink a list of sequences. [Category: Data Modification]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input",     dest="inputFileName",  action="store",               type="string", help="input file [compulsory] [format: file in format given by -f]")
    parser.add_option("-o", "--output",    dest="outputFileName", action="store", default=None, type="string", help="output file [compulsory] [format: output file in format given by -f]")
    parser.add_option("-f", "--format",    dest="format",         action="store",               type="string", help="format of the file [compulsory] [format: sequence file format]")
    parser.add_option("-s", "--start",     dest="start",          action="store", default=None, type="int",    help="keep first nucleotides [format: int]")
    parser.add_option("-e", "--end",       dest="end",            action="store", default=None, type="int",    help="keep last nucleotides [format: int]")
    parser.add_option("-v", "--verbosity", dest="verbosity",      action="store", default=1,    type="int",    help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # -i, -o and -f are advertised as compulsory: report a missing one with a
    # usage message instead of failing later inside a parser or a writer
    for optionName, optionValue in (("--input", options.inputFileName), ("--output", options.outputFileName), ("--format", options.format)):
        if optionValue is None:
            parser.error("option %s is compulsory" % (optionName))

    # pick the reader/writer pair matching the requested format
    # (a distinct name is used so that the option parser is not shadowed)
    if options.format == "fasta":
        sequenceParser = FastaParser(options.inputFileName, options.verbosity)
        writer         = FastaWriter(options.outputFileName, options.verbosity)
    elif options.format == "fastq":
        sequenceParser = FastqParser(options.inputFileName, options.verbosity)
        writer         = FastqWriter(options.outputFileName, options.verbosity)
    else:
        sys.exit("Do not understand '%s' file format." % (options.format))

    progress = Progress(sequenceParser.getNbSequences(), "Reading file %s" % (options.inputFileName), options.verbosity)
    for sequence in sequenceParser.getIterator():
        # the two restrictions are applied in this order when both are given
        if options.start is not None:
            sequence.shrinkToFirstNucleotides(options.start)
        if options.end is not None:
            sequence.shrinkToLastNucleotides(options.end)
        writer.addSequence(sequence)
        progress.inc()
    progress.done()
import os
import random
import sqlite3
from SMART.Java.Python.mySql.MySqlQuery import MySqlQuery


class MySqlConnection(object):
    """
    Connection to a database
    Despite its name, the class opens a SQLite file (through the sqlite3 module),
    located in the directory given by the SMARTTMPPATH environment variable
    (current directory by default)
    @ivar verbosity:    verbosity
    @type verbosity:    int
    @ivar databaseName: path of the SQLite file
    @type databaseName: string
    @ivar connection:   the sqlite3 connection
    """

    def __init__(self, verbosity = 0):
        """
        Constructor
        Open (and possibly create) a randomly named database file and set the pragmas
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.verbosity    = verbosity
        # defined before connecting, so that __del__ is safe if connect() fails
        self.connection   = None
        self.databaseName = "%s%ssmartdb%d" % (os.environ.get("SMARTTMPPATH", "."), os.sep, random.randint(0, 100000))
        self.connection   = sqlite3.connect(self.databaseName)
        self.executeQuery("PRAGMA journal_mode = OFF")
        self.executeQuery("PRAGMA synchronous = 0")
        self.executeQuery("PRAGMA locking_mode = EXCLUSIVE")
        # the pragma is spelt "count_changes"; SQLite silently ignores unknown pragmas
        self.executeQuery("PRAGMA count_changes = OFF")
        self.executeQuery("PRAGMA temp_store = 2")


    def __del__(self):
        if getattr(self, "connection", None) is not None:
            self.connection.close()


    def createDatabase(self):
        """
        Nothing to do: the file is created when the connection is opened
        """
        pass


    def deleteDatabase(self):
        """
        Remove the database file, if it exists
        """
        if os.path.exists(self.databaseName):
            os.remove(self.databaseName)


    def _executeWithRetry(self, query, command, insertion = False):
        """
        Execute one command, and try once more if it fails
        NOTE(review): the retry presumably guards against transient failures
        (such as a locked file) -- to be confirmed
        @return: what L{MySqlQuery.execute} returns
        """
        try:
            return query.execute(command, insertion)
        except Exception:
            return query.execute(command, insertion)


    def _commitWithRetry(self):
        """
        Commit the current transaction, and try once more if it fails
        """
        try:
            self.connection.commit()
        except Exception:
            self.connection.commit()


    def executeQuery(self, command, insertion = False):
        """
        Execute a command and commit
        @param command:   the SQL command
        @type  command:   string
        @param insertion: whether the command is an insertion
        @type  insertion: boolean
        @return: the id of the inserted row if insertion is set, the query otherwise
        """
        cursor = self.connection.cursor()
        query  = MySqlQuery(cursor, self.verbosity)
        result = self._executeWithRetry(query, command, insertion)
        self._commitWithRetry()
        if insertion:
            return result
        return query


    def executeManyQueries(self, commands):
        """
        Execute several commands, then commit once
        Only a failing command is retried: replaying the whole list would
        execute the commands which already succeeded a second time
        @param commands: the SQL commands
        @type  commands: list of string
        """
        cursor = self.connection.cursor()
        query  = MySqlQuery(cursor, self.verbosity)
        for command in commands:
            self._executeWithRetry(query, command)
        self._commitWithRetry()


    def executeManyQueriesIterator(self, table):
        """
        Execute the commands given by the iterator of an object, then commit once
        Only a failing command is retried (see L{executeManyQueries})
        @param table: an object whose getIterator method yields SQL commands
        """
        cursor = self.connection.cursor()
        query  = MySqlQuery(cursor, self.verbosity)
        for command in table.getIterator():
            self._executeWithRetry(query, command)
        self._commitWithRetry()


    def executeFormattedQuery(self, command, *parameters):
        """
        Execute a command with bound parameters and commit
        @param command:    the SQL command, with placeholders
        @type  command:    string
        @param parameters: the values of the placeholders
        @return: the query
        """
        cursor = self.connection.cursor()
        query  = MySqlQuery(cursor, self.verbosity)
        query.executeFormat(command, parameters)
        self.connection.commit()
        return query
import random
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.mySql.MySqlTable import MySqlTable


class MySqlExonTable(MySqlTable):
    """
    A table of exons in a mySQL database
    The name of the table is "<name>[_<chromosome>]_exons"
    """

    def __init__(self, connection, name = None, chromosome = None, verbosity = 0):
        """
        Constructor
        @param connection: connection to a database
        @type  connection: class L{MySqlConnection}
        @param name:       prefix of the table name (random if not given)
        @type  name:       string
        @param chromosome: chromosome name, added to the table name if given
        @type  chromosome: string
        @param verbosity:  verbosity
        @type  verbosity:  int
        """
        if chromosome is None:
            chromosome = ""
        else:
            chromosome = "_%s" % chromosome
        if name is None:
            name = "TmpTable_%d" % (random.randint(0, 100000))
        name = "%s%s_exons" % (name, chromosome)
        super(MySqlExonTable, self).__init__(connection, name, verbosity)


    def createExonTable(self):
        """
        Create the table: the columns of an interval, plus the id of the transcript
        """
        variables = Interval.getSqlVariables()
        variables.append("transcriptId")
        types = Interval.getSqlTypes()
        types["transcriptId"] = "int"
        sizes = Interval.getSqlSizes()
        sizes["transcriptId"] = 11
        self.create(variables, types, sizes)


    def rename(self, name):
        """
        Rename the table (the "_exons" suffix is added to the given name)
        @param name: the new prefix
        @type  name: string
        """
        super(MySqlExonTable, self).rename("%s_exons" % name)


    def addExon(self, exon, transcriptId):
        """
        Store an exon, and set its id to the id of the new row
        @param exon:         the exon
        @type  exon:         class L{Interval}
        @param transcriptId: the id of the transcript which contains the exon
        @type  transcriptId: int
        """
        values = exon.getSqlValues()
        values["transcriptId"] = transcriptId
        exon.id = self.addLine(values)


    def retrieveExonsFromTranscriptId(self, transcriptId):
        """
        Get the exons of a transcript
        @param transcriptId: the id of the transcript
        @type  transcriptId: int
        @return: a list of L{Interval} (empty if the table does not exist)
        """
        if not self.created:
            return []
        # the name is quoted, as everywhere in MySqlTable: it may contain a chromosome name
        query = self.mySqlConnection.executeQuery("SELECT * FROM '%s' WHERE transcriptId = %d" % (self.name, transcriptId))
        exons = []
        for exonLine in query.getIterator():
            exon = Interval()
            exon.setSqlValues(exonLine)
            exons.append(exon)
        return exons


    def retrieveExonsFromBulkTranscriptIds(self, transcriptIds):
        """
        Get the exons of several transcripts
        @param transcriptIds: the ids of the transcripts
        @type  transcriptIds: list of int
        @return: a dict: transcript id -> list of L{Interval}
        """
        if not transcriptIds:
            return {}
        if not self.created:
            return {}
        exons = dict([(transcriptId, []) for transcriptId in transcriptIds])
        query = self.mySqlConnection.executeQuery("SELECT * FROM '%s' WHERE transcriptId IN (%s)" % (self.name, ", ".join(["%s" % (transcriptId) for transcriptId in transcriptIds])))
        for exonLine in query.getIterator():
            exon = Interval()
            exon.setSqlValues(exonLine)
            # transcriptId is the last column (see createExonTable)
            exons[exonLine[-1]].append(exon)
        return exons


    def removeFromTranscriptId(self, transcriptId):
        """
        Remove the exons of a transcript
        @param transcriptId: the id of the transcript
        @type  transcriptId: int
        """
        if not self.created:
            return
        self.mySqlConnection.executeQuery("DELETE FROM '%s' WHERE transcriptId = %d" % (self.name, transcriptId))
class MySqlQuery(object):
    """
    Query to a database
    Thin wrapper around a database cursor
    @ivar verbosity:  verbosity (queries are printed when it is above 99)
    @type verbosity:  int
    @ivar cursor:     the cursor
    @ivar insertedId: the id of the last row inserted through L{execute}
    @type insertedId: int
    """

    def __init__(self, cursor, verbosity = 0):
        self.verbosity  = verbosity
        self.cursor     = cursor
        self.insertedId = None


    def __del__(self):
        self.cursor.close()


    def execute(self, query, insertion = False):
        """
        Execute a command
        @param query:     the SQL command
        @type  query:     string
        @param insertion: whether the command is an insertion
        @type  insertion: boolean
        @return: the id of the inserted row if insertion is set, what the cursor returns otherwise
        @raise Exception: if the command fails
        """
        if self.verbosity > 99:
            print("Querying %s" % (query))
        try:
            results = self.cursor.execute(query)
        except Exception as error:
            # keep the reason of the failure in the message
            raise Exception("Error! Command \"%s\" failed! (%s)" % (query, error))
        if insertion:
            # remembered, so that getInsertedId gives it back
            self.insertedId = self.cursor.lastrowid
            return self.insertedId
        return results


    def executeFormat(self, query, parameters):
        """
        Execute a command with bound parameters
        @param query:      the SQL command, with placeholders
        @type  query:      string
        @param parameters: the values of the placeholders
        @type  parameters: sequence
        @return: what the cursor returns
        """
        if self.verbosity > 99:
            print(" ".join(["Querying %s |" % (query)] + [str(parameter) for parameter in parameters]))
        results = self.cursor.execute(query, parameters)
        return results


    def getLine(self):
        """
        @return: the next row, or None if there is none left
        """
        return self.cursor.fetchone()


    def getLines(self, lines = None):
        """
        @param lines: maximum number of rows (all the remaining rows if not given)
        @type  lines: int
        @return: the list of the next rows
        """
        if lines is None:
            return self.cursor.fetchall()
        return self.cursor.fetchmany(lines)


    def isEmpty(self):
        """
        Check whether the last command gave or modified no row
        The remaining rows are consumed
        @return: true if it is empty
        """
        lines    = self.getLines()
        rowCount = self.cursor.rowcount
        # sqlite3 does not count the rows of a SELECT (rowcount is -1):
        # the fetched rows are then the only reliable information
        if rowCount is None or rowCount < 0:
            return len(lines) == 0
        return rowCount == 0


    def getInsertedId(self):
        """
        @return: the id of the last row inserted through L{execute} (None if none)
        """
        return self.insertedId


    def getIterator(self):
        """
        Iterate on the remaining rows
        """
        line = self.getLine()
        while line is not None:
            yield line
            line = self.getLine()


    def show(self):
        """
        Print the remaining rows
        """
        for line in self.getIterator():
            print(line)
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
import re
import sys

class MySqlTable(object):
    """
    Store a table of a mySQL database, used for transcripts or exons
    Record a name and a type (int, float, double) for each column
    The SQL which is issued here is the SQLite dialect (sqlite_master, PRAGMA table_info)
    @ivar name:            name of the table
    @type name:            string
    @ivar variables:       name of the columns
    @type variables:       list of string
    @ivar types:           type of the columns
    @type types:           dict of string
    @ivar sizes:           size of the types of the columns
    @type sizes:           dict of int
    @ivar mySqlConnection: connection to a database
    @type mySqlConnection: class L{MySqlConnection}
    @ivar nbLines:         number of rows
    @type nbLines:         int
    @ivar created:         whether the table exists in the database
    @type created:         boolean
    @ivar verbosity:       verbosity
    @type verbosity:       int
    """

    def __init__(self, connection, name, verbosity = 0):
        """
        Constructor
        Possibly retrieve column names and types if table exists
        @param connection: connection to a database
        @type  connection: class L{MySqlConnection}
        @param name:       name of the table
        @type  name:       string
        @param verbosity:  verbosity
        @type  verbosity:  int
        """
        self.name      = name
        self.variables = []
        self.types     = {}
        self.sizes     = {}
        self.nbLines   = None
        self.verbosity = verbosity
        self.mySqlConnection = connection
        queryTables = self.mySqlConnection.executeQuery("SELECT name FROM sqlite_master WHERE type LIKE 'table' AND name LIKE '%s'" % (self.name))
        self.created = not queryTables.isEmpty()
        if self.created:
            queryFields = self.mySqlConnection.executeQuery("PRAGMA table_info('%s')" % (name))
            # a row of table_info is (cid, name, type, notnull, dflt_value, pk)
            for field in queryFields.getIterator():
                if field[1] != "id":
                    self.variables.append(field[1])
                    self.types[field[1]], self.sizes[field[1]] = MySqlTable._parseColumnType(field[2], field[3])


    @staticmethod
    def _parseColumnType(declaration, default):
        """
        Split a declared type such as "varchar(255)" into ("varchar", 255)
        @param declaration: the declared type of a column
        @type  declaration: string
        @param default:     the size which is given when the declaration has none
        @return: a couple (type, size); the declaration is given as is if it cannot be split
        """
        m = re.search(r"(\w+)\((\d+)\)", declaration)
        if m is None:
            return (declaration, default)
        return (m.group(1), int(m.group(2)))


    def getName(self):
        """
        @return: the name of the table
        """
        return self.name


    def create(self, variables, types, sizes):
        """
        Create a table using given column names and types
        The table is removed first if it already exists
        @param variables: names of the columns
        @type  variables: list of string
        @param types:     types of the columns
        @type  types:     dict of string
        @param sizes:     sizes of the types
        @type  sizes:     dict of int
        """
        self.variables = variables
        self.types     = types
        self.sizes     = sizes
        if self.created:
            self.remove()
        query = "CREATE TABLE '%s' (id INTEGER PRIMARY KEY" % (self.name)
        for variable in variables:
            query = "%s, %s %s(%d)" % (query, variable, types[variable], sizes[variable])
        query += ")"
        self.mySqlConnection.executeQuery(query)
        self.created = True


    def insertMany(self, lines):
        """
        Insert many lines
        @param lines: the list of values
        @type  lines: list of dicts (column name -> value)
        """
        commands = []
        for values in lines:
            commands.append("INSERT INTO '%s' (%s) VALUES (%s)" % (self.name, ", ".join(self.variables), ", ".join([MySqlTable.formatSql(values[variable], self.types[variable], self.sizes[variable]) for variable in self.variables])))
        self.mySqlConnection.executeManyQueries(commands)


    def rename(self, name):
        """
        Rename the table
        @param name: the new name
        @type  name: string
        """
        # "RENAME TABLE ... TO ..." is MySQL only: SQLite uses ALTER TABLE
        self.mySqlConnection.executeQuery("ALTER TABLE '%s' RENAME TO '%s'" % (self.name, name))
        self.name = name


    def copy(self, table):
        """
        Copy the given table to this one
        @param table: the table to be copied
        @type  table: class L{MySqlTable}
        @raise Exception: if the type of a column cannot be read
        """
        variables = []
        types = {}
        sizes = {}
        fields = self.mySqlConnection.executeQuery("PRAGMA table_info('%s')" % (table.name))
        for field in fields.getIterator():
            if field[1] != "id":
                variables.append(field[1])
                m = re.search(r"(\w+)\((\d+)\)", field[2])
                if m is None:
                    raise Exception("\nFormat %s in table %s is strange." % (field[2], table.name))
                types[field[1]] = m.group(1)
                sizes[field[1]] = int(m.group(2))
        self.create(variables, types, sizes)
        self.mySqlConnection.executeQuery("INSERT INTO '%s' SELECT * FROM '%s'" % (self.name, table.name))


    def add(self, table):
        """
        Add the content of a table to this one
        @param table: the table to be added
        @type  table: class L{MySqlTable}
        """
        self.mySqlConnection.executeQuery("INSERT INTO '%s' SELECT * FROM '%s'" % (self.name, table.name))
        self.created = True


    def exists(self):
        """
        Check if the table exists in mySQL
        @return: true if it exists
        """
        return self.created


    def remove(self):
        """
        Remove this table
        """
        if self.exists():
            query = "DROP TABLE IF EXISTS '%s'" % (self.name)
            self.mySqlConnection.executeQuery(query)
        self.created = False


    def clear(self):
        """
        Clear the content of this table
        """
        self.mySqlConnection.executeQuery("DELETE FROM '%s'" % (self.name))


    def getNbElements(self):
        """
        Count the number of rows in the table
        @return: the number of rows
        """
        command = "SELECT COUNT(id) FROM '%s'" % (self.name)
        query = self.mySqlConnection.executeQuery(command)
        return int(query.getLine()[0])


    @classmethod
    def formatSql(cls, value, type, size):
        """
        Format a value using MySQL encapsulation
        @param value: the value
        @param type:  the type of the column (should contain int, float, double or varchar)
        @type  type:  string
        @param size:  the size of the column (used to crop the varchar values)
        @type  size:  int
        @return: the value, as it should be written in a SQL command
        @raise Exception: if the type is not known
        """
        if type.find("int") != -1:
            return "%d" % value
        if type.find("float") != -1:
            return "%.10f" % value
        if type.find("double") != -1:
            return "%.20f" % value
        if type.find("varchar") != -1:
            if len(value) > size:
                value = value[0:size]
            # a quote inside the value would otherwise close the SQL string
            return "'%s'" % value.replace("'", "''")
        raise Exception("Do not understand type %s" % (type))


    def addLine(self, values):
        """
        Add a row to this table
        @param values: the values of the row
        @type  values: dict
        @return: the id of the added row
        """
        sqlValues = []
        for variable in self.variables:
            sqlValues.append(self.formatSql(values[variable], self.types[variable], self.sizes[variable]))
        command = "INSERT INTO '%s' (%s) VALUES (%s)" % (self.name, ", ".join(self.variables), ", ".join(sqlValues))
        return self.mySqlConnection.executeQuery(command, True)


    def retrieveFromId(self, id):
        """
        Retrieve a row from its id
        @param id: the id of the row
        @type  id: int
        @return: the row
        @raise Exception: if the id is not in the table
        """
        query = self.mySqlConnection.executeQuery("SELECT * FROM '%s' WHERE id = %d" % (self.name, id))
        result = query.getLine()
        if result is None:
            raise Exception("Error! Id %d is not in the table %s!" % (id, self.name))
        return result


    def retrieveBulkFromId(self, ids):
        """
        Retrieve several rows from their ids
        @param ids: the ids of the rows
        @type  ids: list of int
        @return: the rows
        @raise Exception: if some ids are not in the table
        """
        if not ids:
            return []
        MAXSIZE = 1000
        results = []
        # the ids are asked by batches, to keep the commands short
        for batch in range(len(ids) // MAXSIZE + 1):
            theseIds = ids[batch * MAXSIZE : (batch+1) * MAXSIZE]
            if theseIds:
                query = self.mySqlConnection.executeQuery("SELECT * FROM '%s' WHERE id IN (%s)" % (self.name, ", ".join(["%d" % (id) for id in theseIds])))
                lines = query.getLines()
                if len(lines) != len(theseIds):
                    raise Exception("Error! Some Ids of (%s) are missing in the table '%s' (got %d instead of %d)!" % (", ".join(["%d" % (id) for id in theseIds]), self.name, len(lines), len(theseIds)))
                results.extend(lines)
        return results


    def removeFromId(self, id):
        """
        Remove a row from its id
        @param id: the id of the row
        @type  id: int
        """
        self.mySqlConnection.executeQuery("DELETE FROM '%s' WHERE id = %d" % (self.name, id))


    def getIterator(self):
        """
        Iterate on the content of table
        @return: iterator to the rows of the table
        """
        if not self.created:
            return
        MAXSIZE = 1000
        query = self.mySqlConnection.executeQuery("SELECT count(id) FROM '%s'" % (self.name))
        nbRows = int(query.getLine()[0])
        # the rows are read by chunks of MAXSIZE rows
        for chunk in range((nbRows // MAXSIZE) + 1):
            query = self.mySqlConnection.executeQuery("SELECT * FROM '%s' LIMIT %d, %d" % (self.name, chunk * MAXSIZE, MAXSIZE))
            for line in query.getIterator():
                yield line


    def createIndex(self, indexName, values, unique = False, fullText = False):
        """
        Add an index on the table
        @param indexName: name of the index
        @type  indexName: string
        @param values:    names of the columns to be indexed
        @type  values:    list of string
        @param unique:    if the index is unique
        @type  unique:    boolean
        @param fullText:  whether full text should be indexed
        @type  fullText:  boolean
        """
        # NOTE(review): FULLTEXT looks like MySQL syntax -- check that SQLite accepts it
        self.mySqlConnection.executeQuery("CREATE %s%sINDEX '%s' ON '%s' (%s)" % ("UNIQUE " if unique else "", "FULLTEXT " if fullText else "", indexName, self.name, ", ".join(values)))


    def setDefaultTagValue(self, field, name, value):
        """
        Add a tag value to the rows which do not have this tag
        @param field: index (in a row) of the column which contains the tags
        @type  field: int
        @param name:  name of the tag
        @type  name:  string
        @param value: value of the tag
        @type  value: string or int
        """
        newData = {}
        for line in MySqlTable.getIterator(self):
            id   = line[0]
            tags = line[field]
            if tags == '':
                newTag = "%s=%s" % (name, value)
            else:
                newTag = "%s;%s=%s" % (tags, name, value)
            if name not in [tag.split("=")[0] for tag in tags.split(";")]:
                newData[id] = newTag
        # the rows are updated afterwards, not while the table is read
        for id, tag in newData.items():
            self.mySqlConnection.executeQuery("UPDATE '%s' SET tags = '%s' WHERE id = %i" % (self.name, tag.replace("'", "''"), id))


    def show(self):
        """
        Print the content of the current table
        """
        query = self.mySqlConnection.executeQuery("SELECT * FROM '%s'" % (self.name))
        print(query.getLines())
import random
import sys
from SMART.Java.Python.structure.TranscriptList import TranscriptList
from SMART.Java.Python.mySql.MySqlExonTable import MySqlExonTable
from SMART.Java.Python.mySql.MySqlTable import MySqlTable
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.misc.Progress import Progress

class MySqlTranscriptTable(MySqlTable):
    """
    A table of transcripts in a mySQL database
    The name of the table is "<name>[_<chromosome>]_transcripts"
    """

    def __init__(self, connection, name = None, chromosome = None, verbosity = 0):
        """
        Constructor
        @param connection: connection to a database
        @type  connection: class L{MySqlConnection}
        @param name:       prefix of the table name (random if not given)
        @type  name:       string
        @param chromosome: chromosome name, added to the table name if given
        @type  chromosome: string
        @param verbosity:  verbosity
        @type  verbosity:  int
        """
        if chromosome is None:
            chromosome = ""
        else:
            chromosome = "_%s" % chromosome
        if name is None:
            name = "TmpTable_%d" % (random.randint(0, 100000))
        name = "%s%s" % (name, chromosome)
        super(MySqlTranscriptTable, self).__init__(connection, "%s_transcripts" % name, verbosity)


    def createTranscriptTable(self):
        """
        Create the table, with the columns given by L{Transcript}
        """
        self.create(Transcript.getSqlVariables(), Transcript.getSqlTypes(), Transcript.getSqlSizes())


    def rename(self, name):
        """
        Rename the table (the "_transcripts" suffix is added to the given name)
        @param name: the new prefix
        @type  name: string
        """
        super(MySqlTranscriptTable, self).rename("%s_transcripts" % name)


    def remove(self):
        super(MySqlTranscriptTable, self).remove()


    def clear(self):
        super(MySqlTranscriptTable, self).clear()


    def copy(self, transcriptTable):
        """
        Replace this table by a copy of the given table
        @param transcriptTable: the table to be copied
        @type  transcriptTable: class L{MySqlTranscriptTable}
        """
        self.remove()
        super(MySqlTranscriptTable, self).copy(transcriptTable)


    def add(self, transcriptTable):
        super(MySqlTranscriptTable, self).add(transcriptTable)


    def addTranscript(self, transcript):
        """
        Store a transcript, and set its id to the id of the new row
        @param transcript: the transcript
        @type  transcript: class L{Transcript}
        """
        transcript.id = self.addLine(transcript.getSqlValues())


    def addTranscriptList(self, transcriptList):
        """
        Store all the transcripts of a list
        @param transcriptList: the list
        @type  transcriptList: class L{TranscriptList}
        """
        progress = Progress(transcriptList.getNbTranscript(), "Storing list to %s" % (self.name), self.verbosity)
        for transcript in transcriptList.getIterator():
            self.addTranscript(transcript)
            progress.inc()
        progress.done()


    def removeTranscript(self, transcript):
        """
        Remove the row whose id is the id of the given transcript
        """
        self.removeFromId(transcript.id)


    def retrieveTranscriptFromId(self, id):
        """
        @param id: the id of a row
        @type  id: int
        @return: the transcript which is stored in this row
        """
        transcript = Transcript()
        transcript.setSqlValues(self.retrieveFromId(id))
        return transcript


    def retrieveBulkTranscriptFromId(self, ids):
        """
        @param ids: the ids of some rows
        @type  ids: list of int
        @return: the transcripts which are stored in these rows (one per id, in no particular order)
        """
        if not ids:
            return []
        transcripts = self.retrieveBulkFromId(ids)
        idsToTranscripts = {}
        for values in transcripts:
            transcript = Transcript()
            transcript.setSqlValues(values)
            idsToTranscripts[values[0]] = transcript
        return idsToTranscripts.values()


    def selectTranscripts(self, command, simple = False):
        """
        Iterate on the transcripts given by a SELECT command
        @param command: the SELECT command
        @type  command: string
        @param simple:  execute the command as is, once (otherwise, it is paged with LIMIT/OFFSET)
        @type  simple:  boolean
        @return: iterator to couples (id, transcript)
        """
        MAXSIZE = 100000
        found   = True
        cpt     = 0
        # stop at the first page which gives nothing
        while found:
            found = False
            if simple:
                thisCommand = command
            else:
                thisCommand = "%s LIMIT %d OFFSET %d" % (command, MAXSIZE, MAXSIZE * cpt)
            query = self.mySqlConnection.executeQuery(thisCommand)
            for line in query.getIterator():
                found = True
                id = int(line[0])
                transcript = Transcript()
                transcript.setSqlValues(line)
                yield (id, transcript)
            cpt += 1
            if simple:
                return


    def getIterator(self):
        """
        Iterate on the transcripts of the table
        """
        for id, transcript in self.selectTranscripts("SELECT * FROM '%s'" % (self.name)):
            yield transcript


    def retrieveTranscriptList(self):
        """
        @return: a L{TranscriptList} with all the transcripts of the table
        """
        transcriptList = TranscriptList()
        # there is no getLines method on a table: the rows are read through the iterator
        for transcript in self.getIterator():
            transcriptList.addTranscript(transcript)
        return transcriptList


    def setDefaultTagValue(self, name, value):
        """
        Add a tag value to the transcripts which do not have this tag
        @param name:  name of the tag
        @type  name:  string
        @param value: value of the tag
        @type  value: string or int
        """
        # +1: the first column of a row is the id, which is not a SQL variable of Transcript
        super(MySqlTranscriptTable, self).setDefaultTagValue(Transcript.getSqlVariables().index("tags")+1, name, value)
# -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/mySql/__init__.py
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/mySql/__init__.pyc
# Binary file smart_toolShed/SMART/Java/Python/mySql/__init__.pyc has changed
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/mySql/test/Test_MySqlTranscriptTable.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/mySql/test/Test_MySqlTranscriptTable.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,158 @@
from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection
from SMART.Java.Python.mySql.MySqlTranscriptTable import MySqlTranscriptTable
import unittest


def _makeSingleExonTranscript(name, tags = None):
    """Build a "+" strand transcript on arm_X spanning 1000-2000 with one exon.

    name: used for both the transcript and its exon.
    tags: optional dict of tag name -> value, set on the transcript before
          the exon is added.
    """
    transcript = Transcript()
    transcript.setName(name)
    transcript.setChromosome("arm_X")
    transcript.setStart(1000)
    transcript.setEnd(2000)
    transcript.setDirection("+")
    for tagName, tagValue in (tags or {}).items():
        transcript.setTagValue(tagName, tagValue)

    exon = Interval()
    exon.setName(name)
    exon.setChromosome("arm_X")
    exon.setStart(1000)
    exon.setEnd(2000)

    transcript.addExon(exon)
    return transcript


class Test_MySqlTranscriptTable(unittest.TestCase):
    """Round-trip tests: transcripts are written to a SQL table and read back."""

    def test_getRange(self):
        """A two-exon transcript written with MySqlTranscriptWriter is read back unchanged."""
        transcript = Transcript()
        transcript.setName("test1.1")
        transcript.setChromosome("arm_X")
        transcript.setStart(1000)
        transcript.setEnd(4000)
        transcript.setSize(2000)
        transcript.setDirection("+")

        exon1 = Interval()
        exon1.setName("test1.1")
        exon1.setChromosome("arm_X")
        exon1.setStart(1000)
        exon1.setEnd(2000)
        exon1.setSize(1000)

        exon2 = Interval()
        exon2.setName("test1.1")
        exon2.setChromosome("arm_X")
        exon2.setStart(3000)
        exon2.setEnd(4000)
        exon2.setSize(1000)

        transcript.addExon(exon1)
        transcript.addExon(exon2)

        connection = MySqlConnection()
        writer = MySqlTranscriptWriter(connection, "testMySqlTranscriptTableGetRange")
        writer.addTranscript(transcript)
        writer.write()

        transcriptContainer = TranscriptContainer("testMySqlTranscriptTableGetRange", "sql")
        transcriptContainer.mySqlConnection = connection
        self.assertEqual(transcriptContainer.getNbTranscripts(), 1)
        for transcript in transcriptContainer.getIterator():
            self.assertEqual(transcript.getName(), "test1.1")
            self.assertEqual(transcript.getChromosome(), "arm_X")
            self.assertEqual(transcript.getStart(), 1000)
            self.assertEqual(transcript.getEnd(), 4000)
            # NOTE(review): 2002 presumably is two exons of 1001 bases each
            # (inclusive coordinates), not the value passed to setSize() - confirm.
            self.assertEqual(transcript.getSize(), 2002)
            self.assertEqual(transcript.getNbExons(), 2)
            exons = transcript.getExons()
            self.assertEqual(exons[0].getStart(), 1000)
            self.assertEqual(exons[0].getEnd(), 2000)
            self.assertEqual(exons[1].getStart(), 3000)
            self.assertEqual(exons[1].getEnd(), 4000)


    def _checkSingleExonTranscript(self, transcript, name):
        """Assert the fields shared by every transcript stored in test_setDefaultTagValue."""
        self.assertEqual(transcript.name, name)
        self.assertEqual(transcript.getChromosome(), "arm_X")
        self.assertEqual(transcript.getStart(), 1000)
        self.assertEqual(transcript.getEnd(), 2000)
        self.assertEqual(transcript.getSize(), 1001)
        self.assertEqual(transcript.getNbExons(), 1)
        exons = transcript.getExons()
        self.assertEqual(exons[0].getStart(), 1000)
        self.assertEqual(exons[0].getEnd(), 2000)


    def test_setDefaultTagValue(self):
        """setDefaultTagValue is applied to a table holding three stored transcripts."""
        transcript1 = _makeSingleExonTranscript("test1.1")
        transcript2 = _makeSingleExonTranscript("test2.1", {"nbOccurrences": "2"})
        transcript3 = _makeSingleExonTranscript("test3.1", {"occurrences": "2"})

        connection = MySqlConnection()
        table = MySqlTranscriptTable(connection, "testMySqlTranscriptTableSetDefaultTagValue")
        table.createTranscriptTable()
        table.addTranscript(transcript1)
        table.addTranscript(transcript2)
        table.addTranscript(transcript3)
        table.setDefaultTagValue("occurrence", "1")

        expectedNames = ("test1.1", "test2.1", "test3.1")
        cpt = 0
        try:
            for transcript in table.getIterator():
                cpt += 1
                # Only three transcripts were stored.
                self.assertTrue(cpt <= 3)
                self._checkSingleExonTranscript(transcript, expectedNames[cpt - 1])
                if cpt == 1:
                    self.assertEqual(transcript.getTagValue("occurrence"), 1)
                elif cpt == 2:
                    self.assertEqual(transcript.getTagValue("nbOccurrences"), 2)
                    self.assertEqual(transcript.getTagValue("occurrence"), 1)
                elif cpt == 3:
                    # This branch used to be guarded by a second "cpt == 2" and
                    # therefore never ran.
                    # NOTE(review): the transcript was stored with the tag
                    # "occurrences" (plural); confirm that 2 is the expected
                    # value of "occurrence" here.
                    self.assertEqual(transcript.getTagValue("occurrence"), 2)
            # Guard against the loop passing vacuously on an empty table.
            self.assertEqual(cpt, 3)
        finally:
            # Drop the test table even when an assertion fails.
            table.remove()

if __name__ == '__main__':
    unittest.main()
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/mySql/test/__init__.py
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/ConvertToNCList.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++
b/smart_toolShed/SMART/Java/Python/ncList/ConvertToNCList.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,172 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2012 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#

import random, os, time, shutil
from optparse import OptionParser
from commons.core.parsing.ParserChooser import ParserChooser
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.ncList.NCList import NCList
from SMART.Java.Python.ncList.NCListCursor import NCListCursor
from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle
from SMART.Java.Python.ncList.FileSorter import FileSorter
from SMART.Java.Python.ncList.NCListMerger import NCListMerger
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
try:
    import cPickle as pickle
except ImportError:
    import pickle

class ConvertToNCList(object):
    """Convert a transcript/mapping file into a single NC-list file.

    The pipeline run() executes is: sort the input per chromosome
    (FileSorter), build one NCList per chromosome, merge them into the output
    file (NCListMerger), then delete the temporary working directory.
    """

    def __init__(self, verbosity = 1):
        self._parsers = {}
        self._parser = None
        # Path prefix handed to FileSorter; set by setOutputFileName().
        self._sortedFileNames = None
        self._inputFileName = None
        self._outputFileName = None
        # Temporary working directory; created by setOutputFileName().
        self._directory = None
        self._index = False
        self._ncLists = {}
        self._splittedFileNames = {}
        self._nbElements = 0
        self._nbElementsPerChromosome = {}
        # Suffix that makes the temporary directory name unlikely to clash.
        self._randomNumber = random.randint(0, 10000)
        self._sorted = False
        self._verbosity = verbosity

    def setInputFileName(self, fileName, format):
        """Select the parser matching `format` and bind it to `fileName`."""
        self._inputFileName = fileName
        chooser = ParserChooser(self._verbosity)
        chooser.findFormat(format)
        self._parser = chooser.getParser(fileName)

    def setOutputFileName(self, fileName):
        """Record the output file and create the temporary working directory.

        The directory is "<output without extension>_<random>_files"; the
        sorted per-chromosome files are written inside it.
        """
        self._outputFileName = fileName
        fileNameNoExtension = os.path.splitext(fileName)[0]
        baseName = "%s_%d" % (fileNameNoExtension, self._randomNumber)
        self._directory = "%s_files" % (baseName)
        if not os.path.exists(self._directory):
            os.makedirs(self._directory)
        # Join only the final path component. baseName still carries the
        # directory part of fileName: with an absolute path os.path.join would
        # discard self._directory altogether, and with a relative "dir/file"
        # path it would point into a sub-directory that was never created.
        self._sortedFileNames = os.path.join(self._directory, os.path.basename(baseName))

    def setIndex(self, boolean):
        """Ask for an index to be created along with the NC-lists."""
        self._index = boolean

    def setSorted(self, boolean):
        """Declare that the input file is already sorted."""
        self._sorted = boolean

    def sortFile(self):
        """Sort (or simply rewrite, if pre-sorted) the input into one file per chromosome."""
        if self._verbosity > 2:
            print("%s file %s..." % ("Rewriting" if self._sorted else "Sorting", self._inputFileName))
        startTime = time.time()
        fs = FileSorter(self._parser, self._verbosity-4)
        fs.setPresorted(self._sorted)
        fs.perChromosome(True)
        fs.setOutputFileName(self._sortedFileNames)
        fs.sort()
        self._splittedFileNames = fs.getOutputFileNames()
        self._nbElementsPerChromosome = fs.getNbElementsPerChromosome()
        self._nbElements = fs.getNbElements()
        endTime = time.time()
        if self._verbosity > 2:
            print("    ...done (%ds)" % (endTime - startTime))

    def createNCLists(self):
        """Build one NCList per chromosome from the sorted files."""
        self._ncLists = {}
        if self._verbosity > 2:
            print("Creating NC-list for %s..." % (self._inputFileName))
        startTime = time.time()
        for chromosome, fileName in self._splittedFileNames.items():
            if self._verbosity > 3:
                print("  chromosome %s" % (chromosome))
            ncList = NCList(self._verbosity)
            if self._index:
                ncList.createIndex(True)
            ncList.setChromosome(chromosome)
            ncList.setFileName(fileName)
            ncList.setNbElements(self._nbElementsPerChromosome[chromosome])
            ncList.buildLists()
            self._ncLists[chromosome] = ncList
        endTime = time.time()
        if self._verbosity > 2:
            print("    ...done (%ds)" % (endTime - startTime))

    def writeOutputFile(self):
        """Merge the per-chromosome NC-lists into the output file."""
        merger = NCListMerger(self._verbosity)
        merger.setFileName(self._outputFileName)
        merger.addIndex(self._index)
        merger.setNCLists(self._ncLists)
        merger.merge()

    def cleanFiles(self):
        """Delete the temporary working directory, if one was created."""
        if self._directory is not None and os.path.isdir(self._directory):
            shutil.rmtree(self._directory)

    def run(self):
        """Execute the whole conversion."""
        self.sortFile()
        self.createNCLists()
        self.writeOutputFile()
        self.cleanFiles()

    def getSortedFileNames(self):
        """Return the chromosome -> sorted file name mapping produced by sortFile()."""
        return self._splittedFileNames

    def getNbElements(self):
        return self._nbElements

    def getNbElementsPerChromosome(self):
        return self._nbElementsPerChromosome

    def getNCLists(self):
        return self._ncLists

    def getTmpDirectory(self):
        return self._directory


if __name__ == "__main__":
    description = "Convert To NC-List v1.0.0: Convert a mapping or transcript file into a NC-List. [Category: NC-List]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="Query input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-d", "--index", dest="index", action="store_true", default=False, help="create an index [default: false] [format: boolean]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="Output file [compulsory] [format: output file in NCList format]")
    parser.add_option("-s", "--sorted", dest="sorted", action="store_true", default=False, help="input file is already sorted [format: boolean] [default: False]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="Trace level [format: int] [default: 1]")
    (options, args) = parser.parse_args()

    ctncl = ConvertToNCList(options.verbosity)
    ctncl.setInputFileName(options.inputFileName, options.format)
    ctncl.setOutputFileName(options.outputFileName)
    ctncl.setIndex(options.index)
    ctncl.setSorted(options.sorted)
    ctncl.run()

# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/FileSorter.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/ncList/FileSorter.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,210 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#

try:
    import cPickle as pickle
except ImportError:
    import pickle
import random, os
from heapq import heapify, heappop, heappush
from itertools import islice, cycle
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress

# Number of transcripts buffered per chromosome before a sorted chunk is flushed to disk.
BUFFER_SIZE = 100 * 1024

class FileSorter(object):
    """Write the transcripts of a parser to pickle file(s), sorted by (start, -end).

    Unsorted input is sorted externally: transcripts are buffered per
    chromosome, flushed to disk as sorted chunks, then the chunks are merged.
    Pre-sorted input is simply re-written. Output is either one file, or one
    "<prefix>_<chromosome>.pkl" file per chromosome.
    """

    def __init__(self, parser, verbosity = 1):
        self._parser = parser
        self._verbosity = verbosity
        # chromosome -> list of (closed) chunk file objects; only .name is used later.
        self._chunks = {}
        self._nbElements = 0
        self._nbElementsPerChromosome = {}
        self._perChromosome = False
        self._isPreSorted = False
        self._outputFileName = None
        self._outputFileNames = {}
        self._prefix = "tmpFile_%d" % (random.randint(0, 100000))
        self._chromosome = None
        if "SMARTTMPPATH" in os.environ:
            self._prefix = os.path.join(os.environ["SMARTTMPPATH"], self._prefix)

    def selectChromosome(self, chromosome):
        """Keep only the transcripts of this chromosome (None keeps everything)."""
        self._chromosome = chromosome

    def perChromosome(self, boolean):
        """Choose between one output file per chromosome and a single output file.

        Must be called before setOutputFileName(), which reads this setting.
        """
        self._perChromosome = boolean

    def setOutputFileName(self, fileName):
        """Set the output file; in per-chromosome mode its extension is dropped to form a prefix."""
        self._outputFileName = fileName
        if self._perChromosome:
            self._outputFileName = os.path.splitext(self._outputFileName)[0]

    def setPresorted(self, presorted):
        self._isPreSorted = presorted

    def sort(self):
        """Produce the output file(s)."""
        if not self._isPreSorted:
            self._batchSort()
        else:
            self._presorted()

    def _presorted(self):
        """Re-write an already sorted input, splitting it per chromosome if requested."""
        progress = UnlimitedProgress(1000, "Writing files %s" % (self._parser.fileName), self._verbosity)
        curChromosome = None
        outputHandle = None
        # Chromosomes whose file was already started during this call.
        startedChromosomes = set()

        if not self._perChromosome:
            outputHandle = open(self._outputFileName, "wb")
        try:
            for transcript in self._parser.getIterator():
                progress.inc()
                if transcript.__class__.__name__ == "Mapping":
                    transcript = transcript.getTranscript()
                chromosome = transcript.getChromosome()
                if self._chromosome != None and chromosome != self._chromosome:
                    continue
                self._nbElements += 1
                self._nbElementsPerChromosome[chromosome] = self._nbElementsPerChromosome.get(chromosome, 0) + 1
                if self._perChromosome and chromosome != curChromosome:
                    if outputHandle is not None:
                        outputHandle.close()
                    self._outputFileNames[chromosome] = "%s_%s.pkl" % (self._outputFileName, chromosome)
                    # If a chromosome shows up again after another one, append
                    # instead of truncating what was already written for it.
                    mode = "ab" if chromosome in startedChromosomes else "wb"
                    startedChromosomes.add(chromosome)
                    outputHandle = open(self._outputFileNames[chromosome], mode)
                    curChromosome = chromosome
                # Same bytes as before (default pickle protocol); written with
                # write() rather than character by character through writelines().
                outputHandle.write(pickle.dumps(transcript))
        finally:
            if outputHandle is not None:
                outputHandle.close()
        progress.done()

    def getNbElements(self):
        return self._nbElements

    def getNbElementsPerChromosome(self):
        return self._nbElementsPerChromosome

    def _printSorted(self, chromosome, chunk):
        """Sort one in-memory chunk by (start, -end) and dump it to a new temporary file."""
        chunk.sort(key = lambda transcript: (transcript.getStart(), -transcript.getEnd()))
        outputChunk = open("%s_%s_%06i.tmp" % (self._prefix, chromosome, len(self._chunks[chromosome])), "wb", 32000)
        self._chunks[chromosome].append(outputChunk)
        try:
            for transcript in chunk:
                outputChunk.write(pickle.dumps(transcript, -1))
        finally:
            outputChunk.close()

    def _pushNextTranscript(self, heap, index, handle):
        """Read the next transcript of a chunk and push it on the heap; close the chunk at EOF."""
        try:
            transcript = pickle.load(handle)
        except EOFError:
            handle.close()
            return
        # The chunk index breaks ties on (start, -end), so transcript objects
        # themselves are never compared and equal keys come out in chunk order.
        heappush(heap, (transcript.getStart(), -transcript.getEnd(), index, transcript, handle))

    def _merge(self, chunks):
        """Yield the transcripts of several sorted chunk files in (start, -end) order.

        `chunks` is a list of file objects of which only `.name` is used; the
        list is left untouched. The temporary files themselves are deleted by
        _batchSort().
        """
        heap = []
        handles = []
        try:
            for index, chunk in enumerate(chunks):
                handle = open(chunk.name, "rb")
                handles.append(handle)
                self._pushNextTranscript(heap, index, handle)
            while heap:
                start, end, index, transcript, handle = heappop(heap)
                yield transcript
                self._pushNextTranscript(heap, index, handle)
        finally:
            # Also reached when the consumer stops early or a read fails.
            for handle in handles:
                handle.close()

    def _batchSort(self):
        """External sort: flush sorted chunks per chromosome, then merge them into the output."""
        currentChunks = {}
        counts = {}
        outputHandle = None
        try:
            progress = UnlimitedProgress(1000, "Sorting file %s" % (self._parser.fileName), self._verbosity)
            for transcript in self._parser.getIterator():
                progress.inc()
                if transcript.__class__.__name__ == "Mapping":
                    transcript = transcript.getTranscript()
                chromosome = transcript.getChromosome()
                if self._chromosome != None and chromosome != self._chromosome:
                    continue
                if chromosome not in self._chunks:
                    self._chunks[chromosome] = []
                    currentChunks[chromosome] = []
                    counts[chromosome] = 0
                currentChunks[chromosome].append(transcript)
                counts[chromosome] += 1
                if counts[chromosome] == BUFFER_SIZE:
                    self._printSorted(chromosome, currentChunks[chromosome])
                    currentChunks[chromosome] = []
                    counts[chromosome] = 0
                self._nbElements += 1
                self._nbElementsPerChromosome[chromosome] = self._nbElementsPerChromosome.get(chromosome, 0) + 1
            # Flush the partially filled buffers.
            for chromosome in self._chunks:
                if counts[chromosome] > 0:
                    self._printSorted(chromosome, currentChunks[chromosome])
            progress.done()
            if not self._perChromosome:
                outputHandle = open(self._outputFileName, "wb")
            progress = Progress(len(self._chunks), "Writing sorted file %s" % (self._parser.fileName), self._verbosity)
            for chromosome in self._chunks:
                if self._perChromosome:
                    self._outputFileNames[chromosome] = "%s_%s.pkl" % (self._outputFileName, chromosome)
                    outputHandle = open(self._outputFileNames[chromosome], "wb")
                for sequence in self._merge(self._chunks[chromosome]):
                    pickle.dump(sequence, outputHandle, -1)
                if self._perChromosome:
                    outputHandle.close()
                progress.inc()
            if not self._perChromosome:
                outputHandle.close()
            progress.done()
        finally:
            if outputHandle is not None and not outputHandle.closed:
                outputHandle.close()
            # Best-effort removal of the temporary chunk files.
            for chunks in self._chunks.values():
                for chunk in chunks:
                    try:
                        chunk.close()
                        os.remove(chunk.name)
                    except (IOError, OSError):
                        pass

    def getOutputFileNames(self):
        """Return the chromosome -> output file name mapping (filled in per-chromosome mode)."""
        return self._outputFileNames
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/FindOverlapsWithOneInterval.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/ncList/FindOverlapsWithOneInterval.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,197 @@
#!
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#

import struct
import math
import os
import time
from optparse import OptionParser
from commons.core.writer.Gff3Writer import Gff3Writer
from SMART.Java.Python.ncList.NCList import NCList
from SMART.Java.Python.ncList.FileSorter import FileSorter
from commons.core.parsing.ParserChooser import ParserChooser
from SMART.Java.Python.ncList.NCListCursor import NCListCursor
from SMART.Java.Python.structure.Transcript import Transcript

LONGSIZE = struct.calcsize('l')

class FindOverlapsWithOneInterval(object):
    """Find the reference elements that overlap one query interval, using an NC-list.

    The query is written to a GFF3 file, tagged with the number and the names
    of the reference elements it overlaps; nothing is written when there is no
    overlap.
    """

    def __init__(self, verbosity):
        self._sortedFileName = None
        self._verbosity = verbosity
        self._overlappingNames = []
        self._nbOverlaps = 0
        self._nbWritten = 0
        self._nbLines = 0
        self._nbTotalLines = 0
        self._iWriter = None
        self._transcript = None

    def __del__(self):
        # Remove the intermediate sorted file, if it was created.
        if self._sortedFileName and os.path.exists(self._sortedFileName):
            os.remove(self._sortedFileName)

    def close(self):
        """Close the output writer; safe to call more than once or before an output is set."""
        if self._iWriter is not None:
            self._iWriter.close()
            self._iWriter = None

    def setOutputFileName(self, fileName):
        self._iWriter = Gff3Writer(fileName)

    def setFileName(self, fileName, format):
        """Set the reference file and derive the name of its sorted intermediate file."""
        chooser = ParserChooser(self._verbosity)
        chooser.findFormat(format)
        self._parser = chooser.getParser(fileName)
        self._sortedFileName = "%s_sorted.pkl" % (os.path.splitext(fileName)[0])

    def setInterval(self, chromosome, start, end):
        """Define the query from coordinates; a "+" strand transcript is built for the output."""
        self._chromosome = chromosome
        self._start = start
        self._end = end
        self._transcript = Transcript()
        self._transcript.setChromosome(chromosome)
        self._transcript.setStart(start)
        self._transcript.setEnd(end)
        self._transcript.setDirection("+")

    def setTranscript(self, transcript):
        """Define the query from a transcript (a Mapping is converted to its transcript first)."""
        if transcript.__class__.__name__ == "Mapping":
            transcript = transcript.getTranscript()
        self._chromosome = transcript.getChromosome()
        self._start = transcript.getStart()
        self._end = transcript.getEnd()
        self._transcript = transcript

    def prepareIntermediateFiles(self):
        """Sort the reference elements of the query chromosome into the intermediate file."""
        fs = FileSorter(self._parser, self._verbosity-4)
        fs.selectChromosome(self._chromosome)
        fs.perChromosome(False)
        fs.setOutputFileName(self._sortedFileName)
        fs.sort()
        self._nbTotalLines = fs.getNbElements()
        # The chromosome is absent from the dict when it holds no reference element.
        self._nbLines = fs.getNbElementsPerChromosome().get(self._chromosome, 0)

    def createNCList(self):
        """Build the indexed NC-list from the sorted intermediate file."""
        if self._verbosity > 2:
            print("Creating NC-list...")
        startTime = time.time()
        ncList = NCList(self._verbosity)
        ncList.createIndex(True)
        ncList.setChromosome(self._chromosome)
        ncList.setFileName(self._sortedFileName)
        ncList.setNbElements(self._nbTotalLines)
        ncList.buildLists()
        self.setNCList(ncList, ncList.getIndex())
        endTime = time.time()
        if self._verbosity > 2:
            print("    ...done (%ds)" % (endTime - startTime))

    def setNCList(self, ncList, index):
        """Use an already built NC-list (and its index) as the reference."""
        self._ncList = ncList
        self._index = index

    def binarySearch(self, cursor, startL, endL):
        """Move `cursor` to the first sibling in [startL, endL] that overlaps the query.

        Returns the cursor, or None when no sibling of that range overlaps.
        """
        if startL > endL:
            return None
        middleL = (startL + endL) // 2
        cursor.moveSibling(middleL)
        overlap = self.isOverlapping(cursor)
        if overlap == 0:
            if middleL == startL:
                return cursor
            else:
                # An earlier sibling may overlap too: keep looking on the left.
                return self.binarySearch(cursor, startL, middleL)
        if overlap == -1:
            return self.binarySearch(cursor, middleL + 1, endL)
        return self.binarySearch(cursor, startL, middleL - 1)

    def compare(self, cursor = None):
        """Record every element overlapping the query, starting at `cursor` (top level if None).

        Overlapping elements are visited left to right; the children of each
        one are explored recursively.
        """
        self._ncList.openFiles()
        if cursor == None:
            cursor = NCListCursor(None, self._ncList, 0, self._verbosity)
        # NOTE(review): the sibling data is refreshed for recursive calls too
        # (the child cursor needs its own sibling bounds) - confirm against
        # the upstream file, whose indentation is not recoverable here.
        cursor._getSiblingData()
        cursor = self.binarySearch(cursor, cursor._firstSiblingLIndex, cursor._lastSiblingLIndex)
        if cursor == None:
            return
        while not cursor.isOut() and self.isOverlapping(cursor) == 0:
            self.write(cursor)
            newCursor = NCListCursor(cursor)
            if newCursor.hasChildren():
                newCursor.moveDown()
                self.compare(newCursor)
            if cursor.isLast():
                return
            cursor.moveRight()

    def isOverlapping(self, cursor):
        """Return 0 on overlap, 1 if the query ends before the cursor element, -1 if it starts after."""
        if self._end < cursor.getStart():
            return 1
        if self._start > cursor.getEnd():
            return -1
        return 0

    def write(self, cursor):
        """Record the element under the cursor as overlapping the query."""
        self._nbOverlaps += 1
        refTranscript = cursor.getTranscript()
        self._overlappingNames.append(refTranscript.getName())

    def dumpWriter(self):
        """Write the query, tagged with its overlaps, and reset the list of overlapping names."""
        if (not self._overlappingNames) or self._transcript == None:
            return
        self._transcript.setTagValue("nbOverlaps", len(self._overlappingNames))
        self._transcript.setTagValue("overlapsWith", "--".join(self._overlappingNames))
        self._iWriter.addTranscript(self._transcript)
        self._nbWritten += 1
        self._overlappingNames = []

    def run(self):
        """Sort the reference, build the NC-list, search the query and write the result."""
        self.prepareIntermediateFiles()
        self.createNCList()
        self.compare()
        self.dumpWriter()
        self.close()
        if self._verbosity > 0:
            print("# refs:    %d" % (self._nbLines))
            print("# written: %d (%d overlaps)" % (self._nbWritten, self._nbOverlaps))


if __name__ == "__main__":
    description = "FindOverlapsWithOneInterval: Finds overlaps with one query interval."

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="Input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="Format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-s", "--start", dest="start", action="store", type="int", help="The start of the query interval [compulsory] [format: int]")
    parser.add_option("-e", "--end", dest="end", action="store", type="int", help="The end of the query interval [compulsory] [format: int]")
    parser.add_option("-c", "--chromosome", dest="chromosome", action="store", type="string", help="Chromosome of the query interval [compulsory] [format: string]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="Output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="Trace level [format: int] [default: 1]")
    (options, args) = parser.parse_args()

    iFOWOI = FindOverlapsWithOneInterval(options.verbosity)
    iFOWOI.setFileName(options.inputFileName, options.format)
    iFOWOI.setInterval(options.chromosome, options.start, options.end)
    iFOWOI.setOutputFileName(options.outputFileName)
    iFOWOI.run()
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervals.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervals.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,182 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#

import os, struct, time
from optparse import OptionParser
from commons.core.parsing.ParserChooser import ParserChooser
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.ncList.NCList import NCList
from SMART.Java.Python.ncList.NCListCursor import NCListCursor
from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle
from SMART.Java.Python.ncList.FileSorter import FileSorter
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
from SMART.Java.Python.ncList.NCListCursor import NCListCursor
from SMART.Java.Python.ncList.FindOverlapsWithOneInterval import FindOverlapsWithOneInterval

REFERENCE = 0
QUERY = 1
TYPETOSTRING = {0: "reference", 1: "query"}

class FindOverlapsWithSeveralIntervals(object):
    """Report, for every query element, the reference elements it overlaps.

    The reference file is sorted and turned into one NC-list per chromosome;
    each query is then searched with a shared FindOverlapsWithOneInterval
    instance, which also owns the output writer.
    """

    def __init__(self, verbosity = 1):
        self._parsers = {}
        self._outputFileName = "outputOverlaps.gff3"
        self._iWriter = None
        self._nbLines = {REFERENCE: 0, QUERY: 0}
        self._verbosity = verbosity
        self._ncLists = {}
        self._sortedRefFileNames = None
        self._transQueryFileName = None
        # chromosome -> sorted reference file actually written by FileSorter.
        self._splittedFileNames = {}
        self._cursors = {}
        # True between setOutputFileName() and close().
        self._writerOpen = False
        self._iFowoi = FindOverlapsWithOneInterval(self._verbosity)

    def __del__(self):
        self.close()
        # The reference is sorted per chromosome, so the files to delete are
        # the per-chromosome ones; the bare "_ref_sorted.pkl" name is only the
        # prefix they derive from. Names never set stay None and are skipped.
        fileNames = [self._sortedRefFileNames, self._transQueryFileName]
        fileNames.extend(self._splittedFileNames.values())
        for fileName in fileNames:
            if fileName and os.path.exists(fileName):
                os.remove(fileName)

    def close(self):
        """Close the output writer; safe to call more than once or before an output is set."""
        if self._writerOpen:
            self._writerOpen = False
            self._iFowoi.close()

    def setRefFileName(self, fileName, format):
        self.setFileName(fileName, format, REFERENCE)
        self._sortedRefFileNames = "%s_ref_sorted.pkl" % (os.path.splitext(fileName)[0])

    def setQueryFileName(self, fileName, format):
        self.setFileName(fileName, format, QUERY)
        self._transQueryFileName = "%s_query_trans.pkl" % (os.path.splitext(fileName)[0])

    def setFileName(self, fileName, format, type):
        """Create the parser for the reference (type REFERENCE) or query (type QUERY) file."""
        chooser = ParserChooser(self._verbosity)
        chooser.findFormat(format)
        self._parsers[type] = chooser.getParser(fileName)

    def setOutputFileName(self, outputFileName):
        self._iFowoi.setOutputFileName(outputFileName)
        self._writerOpen = True

    def _sortRefFile(self):
        """Sort the reference file into one pickle file per chromosome."""
        fs = FileSorter(self._parsers[REFERENCE], self._verbosity-4)
        fs.perChromosome(True)
        fs.setOutputFileName(self._sortedRefFileNames)
        fs.sort()
        self._nbLines[REFERENCE] = fs.getNbElements()
        self._nbRefLinesPerChromosome = fs.getNbElementsPerChromosome()
        self._splittedFileNames = fs.getOutputFileNames()

    def _translateQueryFile(self):
        """Copy the queries into a pickle file and read them from there afterwards."""
        pickler = NCListFilePickle(self._transQueryFileName, self._verbosity)
        progress = UnlimitedProgress(1000, "Translating query data", self._verbosity-4)
        cpt = 0
        for queryTranscript in self._parsers[QUERY].getIterator():
            pickler.addTranscript(queryTranscript)
            progress.inc()
            cpt += 1
        progress.done()
        self._nbLines[QUERY] = cpt
        self._parsers[QUERY] = NCListFileUnpickle(self._transQueryFileName, self._verbosity)

    def prepareIntermediateFiles(self):
        self._sortRefFile()
        self._translateQueryFile()

    def createNCLists(self):
        """Build an indexed NC-list, a cursor and an index for every reference chromosome."""
        self._ncLists = {}
        self._indices = {}
        self._cursors = {}
        for chromosome, fileName in self._splittedFileNames.items():
            if self._verbosity > 3:
                print("  chromosome %s" % (chromosome))
            ncList = NCList(self._verbosity)
            ncList.createIndex(True)
            ncList.setChromosome(chromosome)
            ncList.setFileName(fileName)
            ncList.setNbElements(self._nbRefLinesPerChromosome[chromosome])
            ncList.buildLists()
            self._ncLists[chromosome] = ncList
            cursor = NCListCursor(None, ncList, 0, self._verbosity)
            self._cursors[chromosome] = cursor
            self._indices[chromosome] = ncList.getIndex()

    def compare(self):
        """Search every query in the NC-list of its chromosome and write those with overlaps."""
        progress = Progress(self._nbLines[QUERY], "Comparing data", self._verbosity-3)
        startTime = time.time()
        for queryTranscript in self._parsers[QUERY].getIterator():
            # Count every query, including those skipped below, so that the
            # progress total matches the number of queries.
            progress.inc()
            chromosome = queryTranscript.getChromosome()
            if chromosome not in self._ncLists:
                # No reference element on this chromosome: nothing can overlap.
                continue
            self._iFowoi.setNCList(self._ncLists[chromosome], self._indices[chromosome])
            self._iFowoi.setTranscript(queryTranscript)
            self._iFowoi.compare()
            self._iFowoi.dumpWriter()
        progress.done()
        endTime = time.time()
        self._timeSpent = endTime - startTime

    def run(self):
        """Prepare the data, build the NC-lists, compare, and print a summary."""
        startTime = time.time()
        if self._verbosity > 2:
            print("Creating NC-list...")
        self.prepareIntermediateFiles()
        self.createNCLists()
        endTime = time.time()
        if self._verbosity > 2:
            print("    ...done (%.2gs)" % (endTime - startTime))
        self.compare()
        self.close()
        if self._verbosity > 0:
            print("# queries: %d" % (self._nbLines[QUERY]))
            print("# refs:    %d" % (self._nbLines[REFERENCE]))
            print("# written: %d (%d overlaps)" % (self._iFowoi._nbWritten, self._iFowoi._nbOverlaps))
            print("time:      %.2gs" % (self._timeSpent))


if __name__ == "__main__":
    description = "FindOverlaps With Several Intervals v1.0.0: Finds overlaps with several query intervals. [Category: Data comparison]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--query", dest="inputQueryFileName", action="store", type="string", help="Query input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--queryFormat", dest="queryFormat", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--ref", dest="inputRefFileName", action="store", type="string", help="Reference input file [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--refFormat", dest="refFormat", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="Output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="Trace level [format: int] [default: 1]")
    (options, args) = parser.parse_args()

    iFWSI = FindOverlapsWithSeveralIntervals(options.verbosity)
    iFWSI.setRefFileName(options.inputRefFileName, options.refFormat)
    iFWSI.setQueryFileName(options.inputQueryFileName, options.queryFormat)
    iFWSI.setOutputFileName(options.outputFileName)
    iFWSI.run()
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervalsBin.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervalsBin.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,204 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2011
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import random, os, os.path, time, sqlite3
from optparse import OptionParser
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.writer.TranscriptWriter import TranscriptWriter
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.structure.Mapping import Mapping
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
try:
    import cPickle as pickle
except ImportError:
    import pickle

# Smallest and largest bin levels: a bin of level i spans 10 ** i positions.
MINBIN = 3
MAXBIN = 7


def getBin(start, end):
    """Return the id of the smallest bin that fully contains [start, end].

    A bin id is ``level * 10 ** (MAXBIN + 1) + start // 10 ** level``.  When
    the interval fits in no bin between MINBIN and MAXBIN, the id of the
    single catch-all bin (level MAXBIN + 1, offset 0) is returned.
    """
    levelFactor = 10 ** (MAXBIN + 1)
    for level in range(MINBIN, MAXBIN + 1):
        binSize = 10 ** level
        # Floor division keeps the result integral on Python 2 and Python 3.
        startOffset = int(start // binSize)
        if startOffset == int(end // binSize):
            return int(level * levelFactor + startOffset)
    return int((MAXBIN + 1) * levelFactor)

def getOverlappingBins(start, end):
    """Return the (first, last) bin id ranges that may hold overlapping intervals.

    One pair is produced per level between MINBIN and MAXBIN, followed by the
    pair for the catch-all bin.
    """
    levelFactor = 10 ** (MAXBIN + 1)
    bigBin = int((MAXBIN + 1) * levelFactor)
    binRanges = []
    for level in range(MINBIN, MAXBIN + 1):
        binSize = 10 ** level
        firstBin = int(level * levelFactor + int(start // binSize))
        lastBin = int(level * levelFactor + int(end // binSize))
        binRanges.append((firstBin, lastBin))
    binRanges.append((bigBin, bigBin))
    return binRanges


class FindOverlapsWithSeveralIntervalsBin(object):
    """Report the query transcripts that overlap a reference transcript.

    References are stored in a temporary SQLite database, one table per
    chromosome, indexed by the bin computed by getBin().
    """

    def __init__(self, verbosity):
        self.verbosity = verbosity
        # Defined before anything can fail so that __del__ is always safe.
        self.connection = None
        self.tableNames = {}
        self.randomNumber = random.randint(0, 10000)
        self.dbName = "smartdb%d" % (self.randomNumber)
        if "SMARTTMPPATH" in os.environ:
            # os.join does not exist; os.path.join is the intended call.
            self.dbName = os.path.join(os.environ["SMARTTMPPATH"], self.dbName)
        self.connection = sqlite3.connect(self.dbName)
        self.nbQueries = 0
        self.nbRefs = 0
        self.nbWritten = 0
        self.nbOverlaps = 0
        # The database is a throw-away scratch file: trade durability for speed.
        cursor = self.connection.cursor()
        cursor.execute("PRAGMA journal_mode = OFF")
        cursor.execute("PRAGMA synchronous = 0")
        cursor.execute("PRAGMA locking_mode = EXCLUSIVE")
        cursor.execute("PRAGMA count_change = OFF")
        cursor.execute("PRAGMA temp_store = 2")

    def __del__(self):
        """Drop the temporary tables, close the connection and delete the database file."""
        connection = getattr(self, "connection", None)
        if connection is not None:
            cursor = connection.cursor()
            for tableName in self.tableNames.values():
                cursor.execute("DROP TABLE IF EXISTS %s" % (tableName))
            # Close before removing: an open handle blocks deletion on some platforms.
            connection.close()
            self.connection = None
        dbName = getattr(self, "dbName", None)
        if dbName is not None and os.path.exists(dbName):
            os.remove(dbName)

    def createTable(self, chromosome):
        """Create and index the table holding the references of one chromosome."""
        cursor = self.connection.cursor()
        # Table names cannot be bound as SQL parameters, so every character that
        # is not alphanumeric is replaced.  The running count keeps names unique
        # when two chromosome names collapse to the same sanitized form.
        safeName = "".join([c if c.isalnum() else "_" for c in str(chromosome)])
        tableName = "tmpTable_%s_%d_%d" % (safeName, len(self.tableNames), self.randomNumber)
        cursor.execute("CREATE TABLE %s (start INT, end INT, transcript BLOB, bin INT)" % (tableName))
        cursor.execute("CREATE INDEX index_%s ON %s (bin)" % (tableName, tableName))
        self.tableNames[chromosome] = tableName

    def setReferenceFile(self, fileName, format):
        """Parse the reference file and store every transcript, pickled, with its bin."""
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        parser = chooser.getParser(fileName)
        startTime = time.time()
        if self.verbosity > 2:
            print("Storing into table")
        cursor = self.connection.cursor()
        for transcript in parser.getIterator():
            if transcript.__class__.__name__ == "Mapping":
                transcript = transcript.getTranscript()
            transcriptString = pickle.dumps(transcript)
            chromosome = transcript.getChromosome()
            if chromosome not in self.tableNames:
                self.createTable(chromosome)
            start = transcript.getStart()
            end = transcript.getEnd()
            binId = getBin(start, end)
            cursor.execute("INSERT INTO %s (start, end, transcript, bin) VALUES (?, ?, ?, ?)" % (self.tableNames[chromosome]), (start, end, sqlite3.Binary(transcriptString), binId))
            self.nbRefs += 1
        self.connection.commit()
        endTime = time.time()
        if self.verbosity > 2:
            print(" ...done (%.2gs)" % (endTime - startTime))

    def setQueryFile(self, fileName, format):
        """Open the query file and record its number of items."""
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        self.queryParser = chooser.getParser(fileName)
        self.nbQueries = self.queryParser.getNbItems()

    def setOutputFile(self, fileName):
        """Open the GFF3 writer that receives the overlapping queries."""
        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)

    def compare(self):
        """Write every query that overlaps at least one stored reference.

        nbOverlaps counts overlapping (query, reference) pairs; nbWritten
        counts queries written.
        """
        progress = Progress(self.nbQueries, "Reading queries", self.verbosity)
        startTime = time.time()
        for queryTranscript in self.queryParser.getIterator():
            if queryTranscript.__class__.__name__ == "Mapping":
                queryTranscript = queryTranscript.getTranscript()
            progress.inc()
            queryChromosome = queryTranscript.getChromosome()
            if queryChromosome not in self.tableNames:
                continue
            queryStart = queryTranscript.getStart()
            queryEnd = queryTranscript.getEnd()
            commands = []
            for firstBin, lastBin in getOverlappingBins(queryStart, queryEnd):
                command = "SELECT * FROM %s WHERE bin " % (self.tableNames[queryChromosome])
                if firstBin == lastBin:
                    command += "= %d" % (firstBin)
                else:
                    command += "BETWEEN %d AND %d" % (firstBin, lastBin)
                commands.append(command)
            cursor = self.connection.cursor()
            cursor.execute(" UNION ".join(commands))
            overlap = False
            for refStart, refEnd, refTranscriptString, refBin in cursor:
                # Cheap coordinate test first; unpickle only plausible candidates.
                if refStart <= queryEnd and refEnd >= queryStart:
                    # The blob was pickled by setReferenceFile() in this same run.
                    # bytes() equals str() on Python 2 and also works on Python 3.
                    refTranscript = pickle.loads(bytes(refTranscriptString))
                    if refTranscript.overlapWith(queryTranscript):
                        overlap = True
                        self.nbOverlaps += 1
            if overlap:
                self.writer.addTranscript(queryTranscript)
                self.nbWritten += 1
        progress.done()
        endTime = time.time()
        self.timeSpent = endTime - startTime

    def displayResults(self):
        """Print the counters and the time spent in compare()."""
        print("# queries:  %d" % (self.nbQueries))
        print("# refs:     %d" % (self.nbRefs))
        print("# written:  %d (%d overlaps)" % (self.nbWritten, self.nbOverlaps))
        print("time:       %.2gs" % (self.timeSpent))

    def run(self):
        """Compare the queries with the references, then print the statistics."""
        self.compare()
        self.displayResults()

if __name__ == "__main__":

    description = "Find Overlaps With Several Intervals Using Bin v1.0.1: Use MySQL binning to compare intervals. [Category: Personal]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    fowsib = FindOverlapsWithSeveralIntervalsBin(options.verbosity)
    fowsib.setQueryFile(options.inputFileName1, options.format1)
    fowsib.setReferenceFile(options.inputFileName2, options.format2)
    fowsib.setOutputFile(options.outputFileName)
    fowsib.run()

# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervalsIndex.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervalsIndex.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,137 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2011
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import random, os, time, MySQLdb
from optparse import OptionParser
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.writer.TranscriptWriter import TranscriptWriter
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress


class FindOverlapsWithSeveralIntervalsIndex(object):
    """Report the query transcripts whose span overlaps a reference span.

    Reference (start, end) pairs are stored in a temporary MySQL table indexed
    on (start, end).  Only coordinates are compared: the queries below do not
    filter on chromosome or strand.
    """

    def __init__(self, verbosity):
        self.verbosity = verbosity
        # Defined before connecting so that __del__ is safe if connect() fails.
        self.db = None
        self.tableName = None
        randomNumber = random.randint(0, 10000)
        self.dbName = "smartdb"
        if "SMARTTMPPATH" in os.environ:
            # os.join does not exist; os.path.join is the intended call.
            self.dbName = os.path.join(os.environ["SMARTTMPPATH"], self.dbName)
        self.db = MySQLdb.connect(db = self.dbName)
        self.tableName = "table_%s" % (randomNumber)
        self.nbQueries = 0
        self.nbRefs = 0
        self.nbOverlaps = 0

    def __del__(self):
        """Drop the temporary table, if a connection was ever opened."""
        db = getattr(self, "db", None)
        tableName = getattr(self, "tableName", None)
        if db is None or tableName is None:
            return
        cursor = db.cursor()
        cursor.execute("DROP TABLE IF EXISTS %s" % (tableName))


    def setReferenceFile(self, fileName, format):
        """Create the table and load the (start, end) of every reference transcript."""
        cursor = self.db.cursor()
        cursor.execute("CREATE TABLE %s (start INT, end INT)" % (self.tableName))
        cursor.execute("CREATE INDEX index_%s ON %s (start, end)" % (self.tableName, self.tableName))
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        parser = chooser.getParser(fileName)
        progress = UnlimitedProgress(1000, "Reading references", self.verbosity)
        for transcript in parser.getIterator():
            start = transcript.getStart()
            end = transcript.getEnd()
            # %d forces integer formatting, so the values cannot inject SQL.
            cursor.execute("INSERT INTO %s (start, end) VALUES (%d, %d)" % (self.tableName, start, end))
            self.nbRefs += 1
            progress.inc()
        self.db.commit()
        progress.done()

    def setQueryFile(self, fileName, format):
        """Open the query file and record its number of transcripts."""
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        self.queryParser = chooser.getParser(fileName)
        self.nbQueries = self.queryParser.getNbTranscripts()

    def setOutputFile(self, fileName):
        """Open the GFF3 writer that receives the overlapping queries."""
        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)

    def compare(self):
        """Write every query whose span overlaps at least one stored reference span."""
        progress = Progress(self.nbQueries, "Reading queries", self.verbosity)
        startTime = time.time()
        cursor = self.db.cursor()
        for queryTranscript in self.queryParser.getIterator():
            queryStart = queryTranscript.getStart()
            queryEnd = queryTranscript.getEnd()
            # Only existence matters, so one row is enough.
            command = "SELECT 1 FROM %s WHERE start <= %d and end >= %d LIMIT 1" % (self.tableName, queryEnd, queryStart)
            cursor.execute(command)
            if cursor.fetchone():
                self.writer.addTranscript(queryTranscript)
                self.nbOverlaps += 1
            progress.inc()
        progress.done()
        endTime = time.time()
        self.timeSpent = endTime - startTime

    def displayResults(self):
        """Print the counters and the time spent in compare()."""
        print("# queries:  %d" % (self.nbQueries))
        print("# refs:     %d" % (self.nbRefs))
        print("# overlaps: %d" % (self.nbOverlaps))
        print("time:       %.2gs" % (self.timeSpent))

    def run(self):
        """Compare the queries with the references, then print the statistics."""
        self.compare()
        self.displayResults()

if __name__ == "__main__":

    description = "Find Overlaps With Several Intervals Using Indices v1.0.1: Use MySQL to compare intervals. [Category: Personal]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    fowsii = FindOverlapsWithSeveralIntervalsIndex(options.verbosity)
    fowsii.setQueryFile(options.inputFileName1, options.format1)
    fowsii.setReferenceFile(options.inputFileName2, options.format2)
    fowsii.setOutputFile(options.outputFileName)
    fowsii.run()


# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/FindOverlaps_naif.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/ncList/FindOverlaps_naif.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,85 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.  You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#

import os
import struct
from optparse import OptionParser
from commons.core.parsing.GffParser import GffParser
from commons.core.writer.Gff3Writer import Gff3Writer

LONGSIZE = struct.calcsize('l')

class FindOverlaps_naif(object):
    """Naive overlap finder: every query is compared with every reference.

    The reference file is re-parsed once per query transcript.
    """

    def __init__(self, inputRefGff3FileName, inputQueryGff3FileName):
        self._inputRefGff3FileName = inputRefGff3FileName
        self._inputQueryGff3FileName = inputQueryGff3FileName
        # Defined up front so the other methods never hit a missing attribute.
        self._outputGff3FileName = None
        self._iGff3Writer = None

    def close(self):
        """Close the output writer, if one was opened."""
        if self._iGff3Writer is not None:
            self._iGff3Writer.close()

    def setGff3FileName(self, fileName):
        """Set the reference GFF3 file name."""
        self._inputRefGff3FileName = fileName

    def setQueryGff3FileName(self, fileName):
        """Set the query GFF3 file name."""
        self._inputQueryGff3FileName = fileName

    def setOutputGff3FileName(self, outputGff3FileName):
        """Open the GFF3 writer on the given file.

        An empty string keeps the previously set output name.  Raises
        ValueError when no output name is available at all.
        """
        if outputGff3FileName != '':
            self._outputGff3FileName = outputGff3FileName
        if self._outputGff3FileName is None:
            raise ValueError("No output GFF3 file name given")
        self._iGff3Writer = Gff3Writer(self._outputGff3FileName)

    def run(self):
        """Write each overlapping query, tagged with the overlapped reference IDs."""
        queryParser = GffParser(self._inputQueryGff3FileName, 0)
        for queryTranscript in queryParser.getIterator():
            overlappingIds = []
            # A fresh parser restarts the reference file for each query.
            refParser = GffParser(self._inputRefGff3FileName, 0)
            for refTranscript in refParser.getIterator():
                if queryTranscript.overlapWith(refTranscript):
                    overlappingIds.append(refTranscript.getTagValue('ID'))
            if overlappingIds:
                queryTranscript.setTagValue("nbOverlaps", len(overlappingIds))
                queryTranscript.setTagValue("overlapsWith", "--".join(overlappingIds))
                self._iGff3Writer.addTranscript(queryTranscript)

if __name__ == "__main__":
    description = "FindOverlapsWithSeveralInterval: Finds overlaps with several query intervals."

    parser = OptionParser(description = description)
    parser.add_option("-i", "--inputRef", dest="inputRefGff3FileName", action="store", type="string", help="Reference input file [compulsory] [format: file in gff3 format]")
    parser.add_option("-j", "--inputQuery", dest="inputQueryGff3FileName", action="store", type="string", help="Query input file [compulsory] [format: file in gff3 format]")
    parser.add_option("-o", "--output", dest="outputGff3FileName", action="store", type="string", help="output file [compulsory] [format: output file in gff3 format]")
    (options, args) = parser.parse_args()

    iFON = FindOverlaps_naif(options.inputRefGff3FileName, options.inputQueryGff3FileName)
    iFON.setOutputGff3FileName(options.outputGff3FileName)
    iFON.run()
    iFON.close()

# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/NCIndex.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/ncList/NCIndex.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,55 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.  You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author,  the holder of the
# economic rights,  and the successive licensors  have only  limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading,  using,  modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean  that it is complicated to manipulate,  and  that  also
# therefore means  that it is reserved for developers  and  experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and,  more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#

from SMART.Java.Python.structure.Transcript import Transcript

class NCIndex(object):
    """Coarse position index: maps fixed-width bins to a stored index value.

    Bin b covers positions [b * step, (b + 1) * step).  Each bin holds the
    index passed to the first addTranscript() call whose end reached it.
    """

    def __init__(self, verbosity):
        self._verbosity = verbosity
        self._step = 10000
        self._indices = []

    def setStep(self, step):
        """Set the bin width (number of positions per bin)."""
        self._step = step

    def addTranscript(self, end, index):
        """Assign index to every bin not yet filled, up to the bin containing end.

        Bins that already have a value are left untouched, so calls are
        expected with non-decreasing ends.
        """
        firstEmptyBin = len(self._indices)
        lastBin = int(end // self._step)
        nbNewBins = lastBin + 1 - firstEmptyBin
        if nbNewBins > 0:
            self._indices.extend([index] * nbNewBins)

    def getIndex(self, transcript):
        """Return the index stored for the bin holding the transcript start.

        Starts beyond the last bin return the last stored index.  An empty
        index returns None instead of raising IndexError.
        """
        if not self._indices:
            # NOTE(review): None chosen because NCList.getLLineElements treats a
            # None address like -1 -- confirm against the actual callers.
            return None
        binNumber = int(transcript.getStart() // self._step)
        if binNumber >= len(self._indices):
            return self._indices[-1]
        return self._indices[binNumber]

# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/NCIndex.pyc
# Binary file smart_toolShed/SMART/Java/Python/ncList/NCIndex.pyc has changed
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/NCList.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/ncList/NCList.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,337 @@
#!
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import os, os.path
import struct
import shelve
import sys
from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle
from SMART.Java.Python.ncList.NCIndex import NCIndex
from SMART.Java.Python.misc.Progress import Progress

try:
    xrange
except NameError:
    # Python 3: range is already lazy.
    xrange = range

# Size in bytes of one table cell (a native C long).
LONG_SIZE = struct.calcsize('l')

# Table identifiers (G is only used as a key of the offsets).
H = 0
L = 1
T = 2
G = 3

# Number of cells per record in each table.
H_CELL_SIZE = 2
L_CELL_SIZE = 5
T_CELL_SIZE = 6

# Cell positions inside a record.  LENGTH shares position 1 with END: it is
# the second cell of an H record, whose first cell is START.
START   = 0
END     = 1
ADDRESS = 2
LIST    = 3
PARENT  = 4
NEW     = 5
LENGTH  = 1

def pack(input):
    """Encode a number as one native C long."""
    # int() behaves like long() on Python 2 and also exists on Python 3.
    return struct.pack("l", int(input))
def unpack(input):
    """Decode one native C long."""
    return struct.unpack("l", input)[0]


class NCList(object):
    """Nested containment list stored in binary tables on disk.

    Table T is a scratch table used while building and removed afterwards.
    Table L holds one record per transcript (start, end, transcript address,
    sub-list, parent).  Table H holds one record per list (start in L, length).
    """

    def __init__(self, verbosity):
        self._verbosity = verbosity
        self._subPos = 0
        self._parentPos = 0
        self._nbLines = 0
        self._nbLists = 0
        self._chromosome = None
        self._transcriptFileName = None
        self._lHandle = None
        self._hHandle = None
        self._tHandle = None
        self._parser = None
        self._index = None
        self._sizeDict = {H: H_CELL_SIZE, L: L_CELL_SIZE, T: T_CELL_SIZE}
        self._offsets = {H: 0, L: 0, G: 0}
        self._fileNameDict = {}
        self._handleDict = {}
        self._createIndex = False
        # Values read/written for the virtual record -1 (the root, which has no
        # physical record in any table).
        self._missingValues = dict([table, {}] for table in self._sizeDict)
        self._missingValues[T][LIST] = -1
        self._missingValues[L][LIST] = 0
        self._missingValues[T][NEW] = -1

    def __del__(self):
        for handle in (self._lHandle, self._hHandle):
            if handle is not None:
                handle.close()

    def createIndex(self, boolean):
        """Choose whether buildLists() also builds an NCIndex."""
        self._createIndex = boolean

    def setChromosome(self, chromosome):
        self._chromosome = chromosome

    def setFileName(self, fileName):
        """Set the pickled transcript file and derive the table file names."""
        self._transcriptFileName = fileName
        self._parser = NCListFileUnpickle(fileName, self._verbosity)
        self._setFileNames(fileName)

    def setNbElements(self, nbElements):
        self._nbLines = nbElements

    def setOffset(self, fileType, offset):
        """Set the byte offset added when reading the given table or file."""
        self._offsets[fileType] = offset

    def _setFileNames(self, fileName):
        """Compute the H, L and T file names; no-op until the chromosome is set."""
        if self._chromosome is not None and fileName is not None:
            coreName = os.path.splitext(fileName)[0]
            if "SMARTTMPPATH" in os.environ:
                coreName = os.path.join(os.environ["SMARTTMPPATH"], coreName)
            self._hFileName = "%s_H.bin" % (coreName)
            self._lFileName = "%s_L.bin" % (coreName)
            self._tFileName = "%s_T.bin" % (coreName)
            self._fileNameDict = {H: self._hFileName, L: self._lFileName, T: self._tFileName}

    def getSizeFirstList(self):
        """Return the number of top-level elements (set by buildLists())."""
        return self._sizeFirstList

    def _writeSubListIntoH(self, SubListAddr, SubListLength):
        self._hHandle.write(pack(SubListAddr))
        self._hHandle.write(pack(SubListLength))
        self._subPos += H_CELL_SIZE

    def _writeParentIntoL(self, readAddr, subListAddr, parentAddr, start, end):
        self._lHandle.write(pack(start))
        self._lHandle.write(pack(end))
        self._lHandle.write(pack(readAddr))
        self._lHandle.write(pack(subListAddr))
        self._lHandle.write(pack(parentAddr))
        self._parentPos += L_CELL_SIZE

    def getLLineElements(self, subListLAddr):
        """Read one L record.

        Returns (transcript address, H address of the sub-list, L address of
        the parent, start, end), or five -1 when the address is -1/None or
        lies past the end of the table.
        """
        if subListLAddr == -1 or subListLAddr is None:
            return -1, -1, -1, -1, -1
        self._lHandle.seek(subListLAddr * L_CELL_SIZE * LONG_SIZE + self._offsets[L])
        start = self._lHandle.read(LONG_SIZE)
        if len(start) < LONG_SIZE:
            return -1, -1, -1, -1, -1
        start = unpack(start)
        end = unpack(self._lHandle.read(LONG_SIZE))
        gff3Addr = unpack(self._lHandle.read(LONG_SIZE))
        subListHAddr = unpack(self._lHandle.read(LONG_SIZE))
        parentLAddr = unpack(self._lHandle.read(LONG_SIZE))
        return gff3Addr, subListHAddr, parentLAddr, start, end

    def getHLineElements(self, subListHAddr):
        """Read one H record: (L address of the first element, number of elements).

        Returns (-1, -1) when the record lies past the end of the table.
        """
        self._hHandle.seek(subListHAddr * H_CELL_SIZE * LONG_SIZE + self._offsets[H])
        subListStartBin = self._hHandle.read(LONG_SIZE)
        # Compare with LONG_SIZE, not a literal 8: a C long is 4 bytes on some
        # platforms, where a complete read would otherwise look truncated.
        if len(subListStartBin) < LONG_SIZE:
            return -1, -1
        subListStart = unpack(subListStartBin)
        subListElementsNb = unpack(self._hHandle.read(LONG_SIZE))
        return subListStart, subListElementsNb

    def getRefGffAddr(self, currentRefLAddr):
        """Return the transcript address stored in the given L record."""
        RefGff3Addr, subListHAddr, parentLAddr, start, end = self.getLLineElements(currentRefLAddr)
        return RefGff3Addr

    def getIntervalFromAdress(self, address):
        """Load the transcript stored at the given address of the transcript file."""
        self._parser.gotoAddress(int(address) + self._offsets[G])
        iTranscrit = self._parser.getNextTranscript()
        return iTranscrit

    def removeFiles(self):
        return

    def buildLists(self):
        """Build the H and L tables from the transcript file, then drop table T."""
        if self._createIndex:
            self._index = NCIndex(self._verbosity)
        self._createTables()
        self._labelLists()
        self._computeSubStart()
        self._computeAbsPosition()
        self._cleanFiles()

    def _createTables(self):
        self._initLists()
        self._createTable(H, self._nbLists)
        self._createTable(T, self._nbLines)
        self._createTable(L, self._nbLines)
        self._fillTables()

    def _initLists(self):
        """Count the lists: one, plus one per transcript included in its predecessor."""
        previousTranscript = None
        self._nbLists = 1
        progress = Progress(self._nbLines, "Initializing lists", self._verbosity-5)
        for transcript in self._parser.getIterator():
            if self._isIncluded(transcript, previousTranscript):
                self._nbLists += 1
            previousTranscript = transcript
            progress.inc()
        progress.done()

    def _isIncluded(self, transcript1, transcript2):
        """Tell whether transcript1 lies within the span of transcript2."""
        return transcript1 is not None and transcript2 is not None and transcript1.getStart() >= transcript2.getStart() and transcript1.getEnd() <= transcript2.getEnd()

    def _createTable(self, name, size):
        """Create the file of a table and fill all its cells with -1."""
        handle = open(self._fileNameDict[name], "w+b")
        progress = Progress(self._sizeDict[name] * size, "Initializing table %d" % (name), self._verbosity-5)
        for i in xrange(self._sizeDict[name] * size):
            handle.write(pack(-1))
            progress.inc()
        progress.done()
        self._handleDict[name] = handle

    def _fillTables(self):
        """Copy start, end and address of each transcript into T; zero the H lengths."""
        progress = Progress(self._nbLines, "Filling table T", self._verbosity-5)
        for i, transcript in enumerate(self._parser.getIterator()):
            self._writeValue(T, i, START, transcript.getStart())
            self._writeValue(T, i, END, transcript.getEnd())
            self._writeValue(T, i, ADDRESS, self._parser.getCurrentTranscriptAddress())
            self._writeValue(T, i, PARENT, -1)
            self._writeValue(T, i, LIST, -1)
            progress.inc()
        progress.done()
        progress = Progress(self._nbLists, "Filling table H", self._verbosity-5)
        for i in xrange(self._nbLists):
            self._writeValue(H, i, LENGTH, 0)
            progress.inc()
        progress.done()

    def _labelLists(self):
        """Find the parent of each transcript and count the size of each list."""
        progress = Progress(self._nbLines, "Getting table structure", self._verbosity-5)
        nextL = 0
        for i in xrange(self._nbLines):
            p = i - 1
            start = self._readValue(T, i, START)
            end = self._readValue(T, i, END)
            # Walk up the ancestors of the previous element until one contains
            # this element; -1 means it belongs to the top-level list.
            while p != -1 and (start < self._readValue(T, p, START) or end > self._readValue(T, p, END)):
                p = self._readValue(T, p, PARENT)
            thisL = self._readValue(T, p, LIST)
            if thisL == -1:
                # First child of p: allocate a new list for it.
                thisL = nextL
                nextL += 1
                length = 0
                self._writeValue(T, p, LIST, thisL)
            else:
                length = self._readValue(H, thisL, LENGTH)
            self._writeValue(T, i, PARENT, p)
            self._writeValue(H, thisL, LENGTH, length + 1)
            progress.inc()
        progress.done()

    def _computeSubStart(self):
        """Turn the list lengths into start positions in L, and reset the lengths."""
        progress = Progress(self._nbLines, "Getting table sub-lists", self._verbosity-5)
        total = 0
        for i in xrange(self._nbLists):
            self._writeValue(H, i, START, total)
            total += self._readValue(H, i, LENGTH)
            self._writeValue(H, i, LENGTH, 0)
            progress.inc()
        progress.done()

    def _computeAbsPosition(self):
        """Write each transcript at its final position in L and link it to its parent."""
        progress = Progress(self._nbLines, "Writing table", self._verbosity-5)
        self._sizeFirstList = 0
        for i in xrange(self._nbLines):
            s  = self._readValue(T, i,  START)
            e  = self._readValue(T, i,  END)
            a  = self._readValue(T, i,  ADDRESS)
            pt = self._readValue(T, i,  PARENT)
            h  = self._readValue(T, pt, LIST)
            pl = self._readValue(T, pt, NEW)
            nb = self._readValue(H, h,  LENGTH)
            l  = self._readValue(H, h,  START) + nb
            self._writeValue(T, i, NEW,     l)
            self._writeValue(L, l, START,   s)
            self._writeValue(L, l, END,     e)
            self._writeValue(L, l, ADDRESS, a)
            self._writeValue(L, l, LIST,    -1)
            self._writeValue(L, l, PARENT,  pl)
            self._writeValue(H, h, LENGTH,  nb+1)
            if nb == 0:
                # First element of its list: let the parent point to the list.
                self._writeValue(L, pl, LIST, h)
            if pl == -1:
                self._sizeFirstList += 1
                # NOTE(review): nesting reconstructed from a whitespace-collapsed
                # source; indexing only top-level elements is assumed -- confirm.
                if self._createIndex:
                    self._index.addTranscript(e, l)
            progress.inc()
        progress.done()

    def closeFiles(self):
        """Close every open table and forget the transcript parser."""
        for handle in self._handleDict.values():
            handle.close()
        # Reset rather than delete, so that a second call is harmless.
        self._handleDict = {}
        self._lHandle = None
        self._hHandle = None
        self._tHandle = None
        self._parser = None

    def openFiles(self):
        """Re-open the L and H tables read-only, and the transcript file."""
        self._lHandle = open(self._fileNameDict[L], "rb")
        self._hHandle = open(self._fileNameDict[H], "rb")
        self._handleDict = {H: self._hHandle, L: self._lHandle}
        self._parser = NCListFileUnpickle(self._transcriptFileName, self._verbosity)

    def _cleanFiles(self):
        self.closeFiles()
        os.remove(self._fileNameDict[T])

    def _getPosition(self, table, line, key):
        """Seek to one cell of a table and return the table handle."""
        handle = self._handleDict[table]
        handle.seek(self._sizeDict[table] * line * LONG_SIZE + key * LONG_SIZE)
        return handle

    def _writeValue(self, table, line, key, value):
        """Write one cell; record -1 is kept in memory instead of on disk."""
        if line == -1:
            self._missingValues[table][key] = value
            return
        handle = self._getPosition(table, line, key)
        handle.write(pack(value))

    def _readValue(self, table, line, key):
        """Read one cell; record -1 is served from memory."""
        if line == -1:
            return self._missingValues[table][key]
        handle = self._getPosition(table, line, key)
        r = unpack(handle.read(LONG_SIZE))
        return r

    def getIndex(self):
        """Return the NCIndex built by buildLists(), or None if none was built."""
        return self._index

# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/NCList.pyc
# Binary file smart_toolShed/SMART/Java/Python/ncList/NCList.pyc has changed
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/NCListCursor.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/ncList/NCListCursor.py Thu Jan 17 10:52:14
2013 -0500 @@ -0,0 +1,325 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import os, os.path, struct
from commons.core.parsing.GffParser import GffParser
from SMART.Java.Python.misc.Progress import Progress


class Data(object):
    """Plain record caching what precompute() reads for one first-list element."""

    def __init__(self, hIndex, transcript, firstChildLIndex, lastChildLIndex, start, end):
        self.hIndex = hIndex
        self.transcript = transcript
        self.firstChildLIndex = firstChildLIndex
        self.lastChildLIndex = lastChildLIndex
        self.start = start
        self.end = end


class NCListCursor(object):
    """Cursor pointing at one line of the L table of an NC-list.

    Every attribute describing the current element (start, end, H index,
    GFF address, transcript, children, siblings, parent) is loaded lazily
    from the NC-list and holds None while it is unknown.  An L index of -1
    means the cursor is out of the list (see isOut()).
    """

    def __init__(self, cursor = None, ncList = None, lIndex = 0, verbosity = 0):
        """Build a cursor, either as a copy of `cursor` or on `ncList` at `lIndex`."""
        self._verbosity = verbosity
        self._mainListData = []
        if cursor:
            self.copy(cursor)
        else:
            self._ncList = ncList
            self.setLIndex(lIndex)

    def setLIndex(self, lIndex):
        """Move to `lIndex` and forget every lazily loaded value."""
        self._lIndex = lIndex
        self._start = None
        self._end = None
        self._hIndex = None
        self._gffIndex = None
        self._parentGffIndex = None
        self._parentLIndex = None
        self._parentHIndex = None
        self._parentStart = None
        self._parentEnd = None
        self._transcript = None
        self._firstSiblingLIndex = None
        self._lastSiblingLIndex = None
        self._firstChildLIndex = None
        self._lastChildLIndex = None
        # Only positions inside the first list have a main-list index.
        self._mainListIndex = lIndex if lIndex < self._ncList.getSizeFirstList() else None

    def precompute(self):
        """Cache a Data record for every element of the first list."""
        self._mainListIndex = 0
        progress = Progress(self._ncList.getSizeFirstList(), "Precomputing data", self._verbosity)
        for i in range(self._ncList.getSizeFirstList()):
            gffIndex, hIndex, parentLIndex, start, end = self._ncList.getLLineElements(i)
            transcript = self._ncList.getIntervalFromAdress(gffIndex)
            firstChildLIndex, nbChildren = self._ncList.getHLineElements(hIndex)
            lastChildLIndex = -1 if firstChildLIndex == -1 else firstChildLIndex + nbChildren-1
            self._mainListData.append(Data(hIndex, transcript, firstChildLIndex, lastChildLIndex, start, end))
            progress.inc()
        progress.done()

    def _updateFromMainListData(self):
        """Refresh the current values from the precomputed cache.

        Returns False when nothing was precomputed or when the current L
        index lies outside the first list, True once the values are copied.
        """
        if not self._mainListData or self._lIndex >= self._ncList.getSizeFirstList():
            #print "OUT"
            return False
        if self._mainListIndex >= self._ncList.getSizeFirstList():
            # NOTE(review): this branch falls through to the lookup below,
            # which then raises IndexError; the intended behaviour is not
            # clear from this file, so it is left unchanged -- confirm.
            self._hIndex = -1
        data = self._mainListData[self._mainListIndex]
        self._hIndex = data.hIndex
        self._transcript = data.transcript
        self._firstChildLIndex = data.firstChildLIndex
        self._lastChildLIndex = data.lastChildLIndex
        self._start = data.start
        self._end = data.end
        return True

    def getLIndex(self):
        """Return the current L index."""
        return self._lIndex

    def _resetCurrentData(self):
        """Forget the lazily loaded values of the current element."""
        self._hIndex = None
        self._start = None
        self._end = None
        self._gffIndex = None
        self._transcript = None
        self._firstChildLIndex = None
        self._lastChildLIndex = None

    def _getCurrentData(self):
        """Load the L line of the current element; raises Exception if its end is -1."""
        self._gffIndex, self._hIndex, self._parentLIndex, self._start, self._end = self._ncList.getLLineElements(self._lIndex)
        #print "-->", self._lIndex, "-->", self._gffIndex, self._hIndex, self._parentLIndex, self._start, self._end
        if self._end == -1:
            raise Exception("Error")

    def _getParentData(self):
        """Load the L line of the parent (the grand-parent index is discarded)."""
        if self._parentLIndex is None:
            self._getCurrentData()
        self._parentGffIndex, self._parentHIndex, _, self._parentStart, self._parentEnd = self._ncList.getLLineElements(self._parentLIndex)

    def _getTranscript(self):
        """Load the transcript stored at the current GFF address."""
        if self._gffIndex is None:
            self._getCurrentData()
        self._transcript = self._ncList.getIntervalFromAdress(self._gffIndex)

    def _getSiblingData(self):
        """Compute the L index range of the siblings of the current element."""
        if self._parentHIndex is None:
            self._getParentData()
        if self._parentHIndex == -1:
            # No parent sub-list: the siblings are the whole first list.
            self._firstSiblingLIndex = 0
            self._lastSiblingLIndex = self._ncList.getSizeFirstList() - 1
        else:
            self._firstSiblingLIndex, nbSiblings = self._ncList.getHLineElements(self._parentHIndex)
            self._lastSiblingLIndex = -1 if self._firstSiblingLIndex == -1 else self._firstSiblingLIndex + nbSiblings-1

    def _getChildrenData(self):
        """Compute the L index range of the children of the current element."""
        if self._hIndex is None:
            self._getCurrentData()
        self._firstChildLIndex, nbChildren = self._ncList.getHLineElements(self._hIndex)
        self._lastChildLIndex = -1 if self._firstChildLIndex == -1 else self._firstChildLIndex + nbChildren-1

    def getGffAddress(self):
        """Return the GFF address of the current element, loading it if needed."""
        if self._gffIndex is None:
            self._getCurrentData()
        return self._gffIndex

    def getStart(self):
        """Return the start of the current element, loading it if needed."""
        if self._start is None:
            self._getCurrentData()
        return self._start

    def getEnd(self):
        """Return the end of the current element, loading it if needed."""
        if self._end is None:
            self._getCurrentData()
        return self._end

    def compare(self, cursor):
        """Return True when both cursors have the same L index."""
        return (self._lIndex == cursor._lIndex)

    def getTranscript(self):
        """Return the current transcript, or None when the cursor is out."""
        if self.isOut():
            return None
        # Identity test: does not depend on how the transcript type defines ==.
        if self._transcript is None:
            self._getTranscript()
        return self._transcript

    def isFirst(self):
        """Return True when the current element is the first of its siblings."""
        #print "is last: ", self._lIndex, self._ncList.getSizeFirstList(), self._lastSiblingLIndex
        if self._lIndex < self._ncList.getSizeFirstList() - 1:
            return (self._lIndex == 0)
        if self._firstSiblingLIndex is None:
            self._getSiblingData()
        return (self._lIndex == self._firstSiblingLIndex)

    def isLast(self):
        """Return True when the current element is the last of its siblings."""
        #print "is last: ", self._lIndex, self._ncList.getSizeFirstList(), self._lastSiblingLIndex
        if self._lIndex < self._ncList.getSizeFirstList() - 1:
            # Strictly before the last element of the first list: the
            # original equality test with that last index is always False.
            return False
        if self._lastSiblingLIndex is None:
            self._getSiblingData()
        return (self._lIndex == self._lastSiblingLIndex)

    def moveUp(self):
        """Move to the parent of the current element."""
        if self._parentLIndex is None:
            self._getCurrentData()
        self._lIndex = self._parentLIndex
        self._updateFromMainListData()
        self._hIndex = self._parentHIndex
        self._gffIndex = self._parentGffIndex
        self._parentLIndex = None
        self._parentHIndex = None
        self._parentGffIndex = None
        self._transcript = None
        self._firstSiblingLIndex = None
        self._lastSiblingLIndex = None
        # NOTE(review): the child range is deliberately left as it is; the
        # original code only re-assigned both bounds to themselves here.
        self._start = self._parentStart
        self._end = self._parentEnd
        self._parentStart = None
        self._parentEnd = None

    def moveRight(self):
        """Move to the next L index; does nothing when the cursor is out."""
        if self.isOut():
            return
        #print "IN1", self
        if self._lIndex < self._ncList.getSizeFirstList() - 1 and self._mainListIndex is not None:
            self._mainListIndex += 1
            self._updateFromMainListData()
        #print "IN2", self
        self._lIndex += 1
        self._resetCurrentData()
        #print "IN3", self

    def moveNext(self):
        """Climb while on a last sibling, then step right; goes out after the last top element."""
        while not self.isOut() and self.isLast():
            if self.isTop():
                self._lIndex = -1
                return
            self.moveUp()
            #print "F1", self
        self.moveRight()
        #print "F2", self

    def moveMiddleSibling(self):
        """Move to the sibling half-way between the first and the last sibling."""
        if self._lIndex < self._ncList.getSizeFirstList() - 1:
            # Floor division keeps the index an integer on every Python version.
            self._mainListIndex = (self._ncList.getSizeFirstList() - 1) // 2
            self._updateFromMainListData()
        if self._lastSiblingLIndex is None:
            self._getSiblingData()
        self._lIndex = (self._lastSiblingLIndex + self._firstSiblingLIndex) // 2
        self._resetCurrentData()

    def moveSibling(self, lIndex):
        """Move to the sibling located at `lIndex`."""
        if self._lIndex < self._ncList.getSizeFirstList() - 1:
            self._mainListIndex = lIndex
            self._updateFromMainListData()
        self._lIndex = lIndex
        self._resetCurrentData()

    def moveLastSibling(self):
        """Move to the last sibling of the current element."""
        if self._lIndex < self._ncList.getSizeFirstList() - 1:
            self._mainListIndex = self._ncList.getSizeFirstList() - 1
            self._updateFromMainListData()
        if self._lastSiblingLIndex is None:
            self._getSiblingData()
        self._lIndex = self._lastSiblingLIndex
        self._resetCurrentData()

    def moveDown(self):
        """Move to the first child; the cursor goes out (-1) when there is none."""
        if self._firstChildLIndex is None:
            self._getChildrenData()
        # The element being left becomes the parent of the new position.
        self._parentLIndex = self._lIndex
        self._parentHIndex = self._hIndex
        self._parentGffIndex = self._gffIndex
        self._parentStart = self._start
        self._parentEnd = self._end
        # Its children are the siblings of the new position.  The first bound
        # used to be left stale, which broke isFirst() and
        # moveMiddleSibling() right after a descent.
        self._firstSiblingLIndex = self._firstChildLIndex
        self._lastSiblingLIndex = self._lastChildLIndex
        self._lIndex = self._firstChildLIndex
        self._resetCurrentData()

    def isOut(self):
        """Return True when the cursor no longer points at an element."""
        return (self._lIndex == -1)

    def isTop(self):
        """Return True when the current element has no parent (parent L index -1)."""
        if self._parentLIndex is None:
            self._getCurrentData()
        return (self._parentLIndex == -1)

    def hasChildren(self):
        """Return True when the current element has at least one child."""
        if self._hIndex is None:
            self._getCurrentData()
        if self._hIndex == -1:
            return False
        if self._firstChildLIndex is None:
            self._getChildrenData()
        return (self._firstChildLIndex != -1)

    def copy(self, cursor):
        """Copy the whole state of `cursor`; the precomputed list is shared, not duplicated."""
        self._ncList = cursor._ncList
        self._lIndex = cursor._lIndex
        self._hIndex = cursor._hIndex
        self._gffIndex = cursor._gffIndex
        self._parentLIndex = cursor._parentLIndex
        self._parentHIndex = cursor._parentHIndex
        self._parentGffIndex = cursor._parentGffIndex
        self._transcript = cursor._transcript
        self._firstSiblingLIndex = cursor._firstSiblingLIndex
        self._lastSiblingLIndex = cursor._lastSiblingLIndex
        self._firstChildLIndex = cursor._firstChildLIndex
        self._lastChildLIndex = cursor._lastChildLIndex
        self._mainListData = cursor._mainListData
        self._mainListIndex = cursor._mainListIndex
        self._verbosity = cursor._verbosity
        self._parentStart = cursor._parentStart
        self._parentEnd = cursor._parentEnd
        self._start = cursor._start
        self._end = cursor._end

    def __str__(self):
        return "NC-list: %s, Lindex: %s, Hindex: %s, GFFindex: %s, start: %s, end: %s, parent Lindex: %s, parent Hindex: %s, parent GFFindex: %s, transcript: %s, last sibling: %s" % (self._ncList, self._lIndex, self._hIndex, self._gffIndex, self._start, self._end, self._parentLIndex, self._parentHIndex, self._parentGffIndex, self._transcript, self._lastSiblingLIndex)

# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/NCListCursor.pyc
# Binary file smart_toolShed/SMART/Java/Python/ncList/NCListCursor.pyc has changed
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/NCListFilePickle.py
# --- /dev/null Thu Jan 01 00:00:00 1970
+0000 +++ b/smart_toolShed/SMART/Java/Python/ncList/NCListFilePickle.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,123 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#

try:
    import cPickle as pickle
except ImportError:
    import pickle
from SMART.Java.Python.structure.Transcript import Transcript


class NCListFilePickle(object):
    """Writer that pickles transcripts one after the other into a file."""

    def __init__(self, fileName, verbosity = 1):
        """Open `fileName` for binary writing."""
        self.fileName = fileName
        self.handle = open(fileName, "wb")
        self.verbosity = verbosity

    def __del__(self):
        # `handle` does not exist when open() failed in __init__.
        handle = getattr(self, "handle", None)
        if handle is not None:
            handle.close()

    def addTranscript(self, transcript):
        """Append `transcript` using the highest pickle protocol (-1)."""
        pickle.dump(transcript, self.handle, -1)

    def write(self):
        """Do nothing; transcripts are written as soon as they are added."""
        pass

    def close(self):
        """Close the underlying file."""
        self.__del__()


class NCListFileUnpickle(object):
    """Reader for a file of consecutively pickled transcripts.

    Reading starts at `initAddress`; when a chromosome is set, reading stops
    at the first transcript located on another chromosome.
    """

    def __init__(self, fileName, verbosity = 1):
        """Open `fileName` for binary reading."""
        self.handle = open(fileName, "rb")
        self.verbosity = verbosity
        self.initAddress = 0
        self.address = self.initAddress
        self.nbTranscripts = None
        self.fileName = fileName
        self.over = False
        self.chromosome = None

    def __del__(self):
        # `handle` does not exist when open() failed in __init__.
        handle = getattr(self, "handle", None)
        if handle is not None:
            handle.close()

    def reset(self):
        """Rewind the file and make address 0 the start address again."""
        self.handle.seek(0)
        self.initAddress = 0
        # The cached count depends on the start address.
        self.nbTranscripts = None

    def setChromosome(self, chromosome):
        """Restrict reading to transcripts of `chromosome`."""
        self.chromosome = chromosome
        # The cached count depends on the chromosome filter.
        self.nbTranscripts = None

    def getNbTranscripts(self):
        """Return the number of transcripts getIterator() yields.

        The count is cached until the start address or the chromosome
        changes.  The read position, current address and `over` flag are
        restored afterwards, so counting does not disturb a reader in use.
        """
        if self.nbTranscripts is not None:
            return self.nbTranscripts
        savedPosition = self.handle.tell()
        savedAddress = self.address
        savedOver = self.over
        count = 0
        for _ in self.getIterator():
            count += 1
        self.handle.seek(savedPosition)
        self.address = savedAddress
        self.over = savedOver
        self.nbTranscripts = count
        return self.nbTranscripts

    def gotoAddress(self, address):
        """Seek to `address` and record it as the current address."""
        self.handle.seek(address)
        self.address = address

    def getNextTranscript(self):
        """Return the next transcript, or False at end of file or chromosome.

        NOTE(review): pickle.load can execute arbitrary code; only read
        files produced by a trusted source.
        """
        self.address = self.handle.tell()
        try:
            transcript = pickle.load(self.handle)
            if self.chromosome is not None and transcript.getChromosome() != self.chromosome:
                self.over = True
                return False
            return transcript
        except EOFError:
            self.over = True
            return False

    def getIterator(self):
        """Yield transcripts from the start address until reading is over."""
        self.gotoAddress(self.initAddress)
        while True:
            transcript = self.getNextTranscript()
            if not transcript:
                self.over = True
                return
            yield transcript

    def setInitAddress(self, address):
        """Set the address getIterator() starts from."""
        self.initAddress = address
        # The cached count depends on the start address.
        self.nbTranscripts = None

    def getCurrentTranscriptAddress(self):
        """Return the address at which the last read started."""
        return self.address

    def isOver(self):
        """Return True once a read hit the end of the file or of the chromosome."""
        return self.over

# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/NCListFilePickle.pyc
# Binary file smart_toolShed/SMART/Java/Python/ncList/NCListFilePickle.pyc has changed
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/NCListHandler.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/ncList/NCListHandler.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,125 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#

import struct
try:
    import cPickle as pickle
except ImportError:
    import pickle
from SMART.Java.Python.ncList.NCList import NCList
from SMART.Java.Python.ncList.NCIndex import NCIndex
from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle

LONG_SIZE = struct.calcsize('l')

# Number of header slots per NC-list, followed by the position of each slot.
INFO_PER_NCLIST = 5
H_FILE = 0
L_FILE = 1
G_FILE = 2
FIRST_LIST_SIZE = 3
INDEX = 4

H = 0
L = 1
T = 2
G = 3

def pack(input):
    """Encode `input` as one native C long."""
    # int() replaces the Python 2 only long(); struct packs both alike.
    return struct.pack("l", int(input))
def unpack(input):
    """Decode one native C long from the bytes `input`."""
    return struct.unpack("l", input)[0]


class NCListHandler(object):
    """Reader of a merged NC-list file holding one NC-list per chromosome.

    Layout read by loadData(): a pickled list of chromosomes, one long per
    chromosome (its number of elements), then INFO_PER_NCLIST longs per
    chromosome (H, L and G offsets, first-list size, index offset or -1).
    """

    def __init__(self, verbosity):
        self._verbosity = verbosity
        self._index = False

    def setFileName(self, fileName):
        """Remember `fileName` and open it for binary reading."""
        self._fileName = fileName
        self._handle = open(fileName, "rb")

    def _readHeaderValue(self, i, info):
        """Return header slot `info` of the `i`-th NC-list (moves the file position)."""
        self._handle.seek(self._headerPos + i * INFO_PER_NCLIST * LONG_SIZE + info * LONG_SIZE)
        return unpack(self._handle.read(LONG_SIZE))

    def loadData(self):
        """Read the header and build one NCList per chromosome.

        NOTE(review): pickle.load can execute arbitrary code; only load
        files produced by a trusted source.
        """
        self._chromosomes = pickle.load(self._handle)
        self._nbElements = 0
        self._nbElementsPerChromosome = {}
        self._ncLists = {}
        for chromosome in self._chromosomes:
            nbElements = unpack(self._handle.read(LONG_SIZE))
            self._nbElementsPerChromosome[chromosome] = nbElements
            self._nbElements += nbElements
        # The per-list header slots start right after the element counts.
        self._headerPos = self._handle.tell()
        for i, chromosome in enumerate(self._chromosomes):
            ncList = NCList(self._verbosity)
            # Every table lives in the same merged file, at its own offset.
            ncList._hHandle = self._handle
            ncList._lHandle = self._handle
            ncList._parser = NCListFileUnpickle(self._fileName)
            ncList.setOffset(H, self._readHeaderValue(i, H_FILE))
            ncList.setOffset(L, self._readHeaderValue(i, L_FILE))
            ncList.setOffset(G, self._readHeaderValue(i, G_FILE))
            ncList._sizeFirstList = self._readHeaderValue(i, FIRST_LIST_SIZE)
            indices = self._readHeaderValue(i, INDEX)
            if indices != -1:
                # -1 means no index was stored for this chromosome.
                self._handle.seek(indices)
                data = pickle.load(self._handle)
                index = NCIndex(self._verbosity)
                index._indices = data
                ncList._index = index
            self._ncLists[chromosome] = ncList

    def getChromosomes(self):
        """Return the list of chromosomes read from the file."""
        return self._chromosomes

    def getNbElements(self):
        """Return the total number of elements over all chromosomes."""
        return self._nbElements

    def getNbElementsPerChromosome(self):
        """Return the dictionary chromosome -> number of elements."""
        return self._nbElementsPerChromosome

    def getNCLists(self):
        """Return the dictionary chromosome -> NCList."""
        return self._ncLists

    def getParser(self, chromosome = None):
        """Return a transcript parser positioned on the transcript section.

        Without `chromosome`, the parser starts at the transcripts of the
        first NC-list and is not limited to one chromosome.  Otherwise it
        starts at, and is restricted to, the given chromosome; an unknown
        chromosome raises ValueError.
        """
        parser = NCListFileUnpickle(self._fileName)
        if chromosome is None:
            # The original passed the handle and an offset to unpack(),
            # which accepts a single bytes argument, so this branch always
            # raised TypeError.
            parser.setInitAddress(self._readHeaderValue(0, G_FILE))
            return parser
        i = self._chromosomes.index(chromosome)
        parser.setInitAddress(self._readHeaderValue(i, G_FILE))
        parser.setChromosome(chromosome)
        return parser

# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/NCListMerger.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/ncList/NCListMerger.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,126 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#

import struct, os, shutil
try:
    import cPickle as pickle
except ImportError:
    import pickle

LONG_SIZE = struct.calcsize('l')

# Number of header slots per NC-list, followed by the position of each slot.
INFO_PER_NCLIST = 5
H_FILE = 0
L_FILE = 1
G_FILE = 2
FIRST_LIST_SIZE = 3
INDEX = 4

def pack(input):
    """Encode `input` as one native C long."""
    # int() replaces the Python 2 only long(); struct packs both alike.
    return struct.pack("l", int(input))
def unpack(input):
    """Decode one native C long from the bytes `input`."""
    return struct.unpack("l", input)[0]


class NCListMerger(object):
    """Merge the per-chromosome NC-list files into one binary file.

    Output layout: the pickled sorted list of chromosomes, one long per
    chromosome (its number of lines), INFO_PER_NCLIST longs per chromosome
    (the header slots), all H tables, all L tables, the optional pickled
    indices and finally all transcript files.
    """

    def __init__(self, verbosity):
        self._verbosity = verbosity
        self._index = False

    def setFileName(self, fileName):
        """Open the output file `fileName` for binary writing."""
        self._handle = open(fileName, "wb")

    def setNCLists(self, ncLists):
        """Set the dictionary chromosome -> NC-list; chromosomes are processed sorted."""
        self._ncLists = ncLists
        self._chromosomes = sorted(self._ncLists.keys())

    def addIndex(self, boolean):
        """Choose whether the indices of the NC-lists are stored as well."""
        self._index = boolean

    def merge(self):
        """Write the merged file, then delete the input files.

        The output handle is closed even when writing fails; the input
        files are only removed after a complete, successful merge.
        """
        try:
            self._writeHeader()
            self._addNCLists()
        finally:
            self._handle.close()
        self._removeInputFiles()

    def _writeHeader(self):
        """Write the chromosomes, the line counts and an empty (-1) header."""
        pickle.dump(self._chromosomes, self._handle, -1)
        for chromosome in self._chromosomes:
            self._handle.write(pack(self._ncLists[chromosome]._nbLines))
        self._headerPos = self._handle.tell()
        # Placeholders, overwritten by _addInHeader() once offsets are known.
        for _ in range(len(self._chromosomes) * INFO_PER_NCLIST):
            self._handle.write(pack(-1))

    def _addInHeader(self, i, info, value = None):
        """Store `value` in slot `info` of the `i`-th NC-list.

        Without `value`, the current position in the output file is stored.
        The write position is restored afterwards.
        """
        currentPos = self._handle.tell()
        if value is None:
            value = currentPos
        self._handle.seek(self._headerPos + i * INFO_PER_NCLIST * LONG_SIZE + info * LONG_SIZE)
        self._handle.write(pack(value))
        self._handle.seek(currentPos)

    def _appendFile(self, i, info, fileName):
        """Record the current offset in slot `info`, then append `fileName`."""
        self._addInHeader(i, info)
        # Binary mode: text mode alters the bytes on Windows and cannot be
        # copied into a binary handle under Python 3.
        inputFile = open(fileName, "rb")
        try:
            shutil.copyfileobj(inputFile, self._handle)
        finally:
            inputFile.close()
        self._inputFileNames.append(fileName)

    def _addNCLists(self):
        """Append every table of every NC-list and fill in the header."""
        self._inputFileNames = []
        for i, chromosome in enumerate(self._chromosomes):
            self._appendFile(i, H_FILE, self._ncLists[chromosome]._hFileName)
        for i, chromosome in enumerate(self._chromosomes):
            self._appendFile(i, L_FILE, self._ncLists[chromosome]._lFileName)
        for i, chromosome in enumerate(self._chromosomes):
            self._addInHeader(i, FIRST_LIST_SIZE, self._ncLists[chromosome].getSizeFirstList())
        if self._index:
            for i, chromosome in enumerate(self._chromosomes):
                self._addInHeader(i, INDEX)
                pickle.dump(self._ncLists[chromosome].getIndex()._indices, self._handle, -1)
        for i, chromosome in enumerate(self._chromosomes):
            self._appendFile(i, G_FILE, self._ncLists[chromosome]._transcriptFileName)

    def _removeInputFiles(self):
        """Delete every input file that was copied into the merged file."""
        for fileName in self._inputFileNames:
            os.remove(fileName)

# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/NCListParser.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/ncList/NCListParser.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,74 @@
#!
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2012 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# + +import random, os, time +from optparse import OptionParser +from commons.core.parsing.ParserChooser import ParserChooser +from SMART.Java.Python.structure.Transcript import Transcript +from SMART.Java.Python.structure.Interval import Interval +from SMART.Java.Python.ncList.NCList import NCList +from SMART.Java.Python.ncList.NCListCursor import NCListCursor +try: + import cPickle as pickle +except: + import pickle + +class NCListParser(object): + + def __init__(self, fileName, verbosity = 1): + self._fileName = fileName + self._ncLists = {} + self._sortedFileNames = {} + self._nbElements = 0 + self._nbElementsPerChromosome = {} + self._verbosity = verbosity + + def parse(self): + handle = open(self._fileName) + self._sortedFileNames = pickle.load(handle) + self._nbElements = pickle.load(handle) + self._nbElementsPerChromosome = pickle.load(handle) + self._ncLists = pickle.load(handle) + for ncList in self._ncLists.values(): + ncList._reopenFiles() + handle.close() + + def getSortedFileNames(self): + return self._sortedFileNames + + def getNbElements(self): + return self._nbElements + + def getNbElementsPerChromosome(self): + return self._nbElementsPerChromosome + + def getNCLists(self): + return self._ncLists diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/__init__.py diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/__init__.pyc Binary file smart_toolShed/SMART/Java/Python/ncList/__init__.pyc has changed diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/test/MockFindOverlapsWithSeveralIntervals.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/ncList/test/MockFindOverlapsWithSeveralIntervals.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,197 @@ +from SMART.Java.Python.misc import Utils + +class MockFindOverlapsWithOneInterval (object) : + def write(self, inFileName): + Utils.writeFile(inFileName, 
"chr1\ttest\ttest1.1\t0\t1000\t.\t+\t.\tID=test1.1;Name=test1.1\n") + +class MockFindOverlapsWithServeralIntervals_case1 (object) : + def write(self,inFileName): + f = open(inFileName, "w") + f.write("chr1\ttest\ttest1.1\t0\t1000\t1001\t+\t.\tID=test1.1;Name=test1.1\n") + f.write("chr1\ttest\ttest1.2\t50\t350\t301\t+\t.\tID=test1.2;Name=test1.2\n") + f.write("chr1\ttest\ttest1.3\t100\t600\t501\t+\t.\tID=test1.3;Name=test1.3\n") + f.write("chr1\ttest\ttest1.4\t200\t450\t251\t+\t.\tID=test1.4;Name=test1.4\n") + f.write("chr1\ttest\ttest1.5\t700\t950\t251\t+\t.\tID=test1.5;Name=test1.5\n") + f.write("chr1\ttest\ttest1.6\t800\t900\t101\t+\t.\tID=test1.6;Name=test1.6\n") + f.write("chr1\ttest\ttest1.7\t1200\t1300\t101\t+\t.\tID=test1.7;Name=test1.7\n") + f.close() + +class MockFindOverlapsWithServeralIntervals_case2 (object) : + def write(self,inFileName): + f = open(inFileName,'w') + f.write("chr1\ttest\ttest2.1\t0\t500\t501\t+\t.\tID=test2.1;Name=test2.1\n") + f.write("chr1\ttest\ttest2.2\t50\t450\t401\t+\t.\tID=test2.2;Name=test2.2\n") + f.write("chr1\ttest\ttest2.3\t100\t400\t301\t+\t.\tID=test2.3;Name=test2.3\n") + f.write("chr1\ttest\ttest2.4\t100\t200\t101\t+\t.\tID=test2.4;Name=test2.4\n") + f.write("chr1\ttest\ttest2.5\t900\t1200\t301\t+\t.\tID=test2.5;Name=test2.5\n") + f.close() + +class MockFindOverlapsWithServeralIntervals_case3 (object) : + def write(self,inFileName): + f = open(inFileName,'w') + f.write("chr1\ttest\ttest3.1\t0\t500\t501\t+\t.\tID=test3.1;Name=test3.1\n") + f.write("chr1\ttest\ttest3.2\t50\t450\t401\t+\t.\tID=test3.2;Name=test3.2\n") + f.write("chr1\ttest\ttest3.3\t100\t400\t301\t+\t.\tID=test3.3;Name=test3.3\n") + f.write("chr1\ttest\ttest3.4\t100\t200\t101\t+\t.\tID=test3.4;Name=test3.4\n") + f.write("chr1\ttest\ttest3.5\t300\t400\t101\t+\t.\tID=test3.5;Name=test3.5\n") + f.write("chr1\ttest\ttest3.6\t800\t1000\t201\t+\t.\tID=test3.6;Name=test3.6\n") + f.close() + +class MockFindOverlapsWithServeralIntervals_case4_5 (object) : + def 
write(self,inFileName): + f = open(inFileName,'w') + f.write("chr1\ttest\ttest4.1\t0\t1000\t1001\t+\t.\tID=test4.1;Name=test4.1\n") + f.write("chr1\ttest\ttest4.2\t200\t800\t601\t+\t.\tID=test4.2;Name=test4.2\n") + f.write("chr1\ttest\ttest4.3\t400\t600\t201\t+\t.\tID=test4.3;Name=test4.3\n") + f.close() + +class MockFindOverlapsWithServeralIntervals_case6_7 (object) : + def write(self,inFileName): + f = open(inFileName,'w') + f.write("chr1\ttest\ttest6.1\t0\t1000\t1001\t+\t.\tID=test6.1;Name=test6.1\n") + f.write("chr1\ttest\ttest6.2\t100\t300\t201\t+\t.\tID=test6.2;Name=test6.2\n") + f.write("chr1\ttest\ttest6.3\t400\t500\t101\t+\t.\tID=test6.3;Name=test6.3\n") + f.write("chr1\ttest\ttest6.4\t510\t520\t11\t+\t.\tID=test6.4;Name=test6.4\n") + f.write("chr1\ttest\ttest6.5\t850\t950\t001\t+\t.\tID=test6.5;Name=test6.5\n") + f.close() + +class MockFindOverlapsWithServeralIntervals_case8 (object) : + def write(self,inFileName): + f = open(inFileName,'w') + f.write("chr1\ttest\ttest8.1\t0\t1000\t1001\t+\t.\tID=test8.1;Name=test8.1\n") + f.write("chr1\ttest\ttest8.2\t100\t200\t101\t+\t.\tID=test8.2;Name=test8.2\n") + f.write("chr1\ttest\ttest8.3\t300\t400\t101\t+\t.\tID=test8.3;Name=test8.3\n") + f.close() + +class MockFindOverlapsWithServeralIntervals_case9 (object) : + def write(self,inFileName): + f = open(inFileName,'w') + f.write("chr1\ttest\ttest9.1\t0\t1000\t1001\t+\t.\tID=test9.1;Name=test9.1\n") + f.write("chr1\ttest\ttest9.2\t600\t700\t101\t+\t.\tID=test9.2;Name=test9.2\n") + f.write("chr1\ttest\ttest9.3\t800\t1200\t401\t+\t.\tID=test9.3;Name=test9.3\n") + f.close() + +class MockFindOverlapsWithServeralIntervals_case10 (object) : + def write(self,inFileName): + f = open(inFileName,'w') + f.write("chr1\ttest\ttest10.1\t0\t1000\t1001\t+\t.\tID=test10.1;Name=test10.1\n") + f.write("chr1\ttest\ttest10.2\t100\t200\t101\t+\t.\tID=test10.2;Name=test10.2\n") + f.write("chr1\ttest\ttest10.3\t300\t400\t101\t+\t.\tID=test10.3;Name=test10.3\n") + 
f.write("chr1\ttest\ttest10.4\t500\t600\t101\t+\t.\tID=test10.4;Name=test10.4\n") + f.write("chr1\ttest\ttest10.5\t1200\t1300\t101\t+\t.\tID=test10.5;Name=test10.5\n") + f.write("chr1\ttest\ttest10.6\t1400\t1500\t101\t+\t.\tID=test10.6;Name=test10.6\n") + f.close() + +class MockFindOverlapsWithServeralIntervals_case11 (object) : + def write(self,inFileName): + f = open(inFileName,'w') + f.write("chr1\ttest\ttest11.1\t0\t500\t501\t+\t.\tID=test11.1;Name=test11.1\n") + f.write("chr1\ttest\ttest11.2\t100\t200\t101\t+\t.\tID=test11.2;Name=test11.2\n") + f.write("chr1\ttest\ttest11.3\t300\t400\t101\t+\t.\tID=test11.3;Name=test11.3\n") + f.write("chr1\ttest\ttest11.4\t700\t900\t201\t+\t.\tID=test11.4;Name=test11.4\n") + f.write("chr1\ttest\ttest11.5\t710\t720\t11\t+\t.\tID=test11.5;Name=test11.5\n") + f.write("chr1\ttest\ttest11.6\t740\t750\t11\t+\t.\tID=test11.6;Name=test11.6\n") + f.close() + +class MockFindOverlapsWithServeralIntervals_case12 (object) : + def write(self,inFileName): + f = open(inFileName,'w') + f.write("chr1\ttest\ttest12.1\t0\t1400\t.\t+\t.\tID=test12.1;Name=test12.1\n") + f.write("chr1\ttest\ttest12.2\t300\t500\t.\t+\t.\tID=test12.2;Name=test12.2\n") + f.write("chr1\ttest\ttest12.3\t300\t500\t.\t+\t.\tID=test12.3;Name=test12.3\n") + f.write("chr1\ttest\ttest12.4\t800\t1100\t.\t+\t.\tID=test12.4;Name=test12.4\n") + f.write("chr1\ttest\ttest12.5\t1200\t1300\t.\t+\t.\tID=test12.5;Name=test12.5\n") + f.close() + +class MockFindOverlapsWithServeralIntervals_query_case1 (object): + def write(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\tquery1.1\t25\t150\t126\t+\t.\tID=query_1;Name=query1.1\n") + f.write("chr1\tquery\tquery1.2\t70\t850\t781\t+\t.\tID=query_2;Name=query1.2\n") + f.write("chr1\tquery\tquery1.3\t550\t850\t201\t+\t.\tID=query_3;Name=query1.3\n") + f.write("chr1\tquery\tquery1.4\t925\t1025\t101\t+\t.\tID=query_4;Name=query1.4\n") + f.write("chr1\tquery\tquery1.5\t1201\t1210\t10\t+\t.\tID=query_5;Name=query1.5\n") + 
f.write("chr1\tquery\tquery1.6\t1500\t1600\t101\t+\t.\tID=query_6;Name=query1.6\n") + f.close() + +class MockFindOverlapsWithServeralIntervals_query_case2 (object): + def write(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\tquery2.1\t150\t300\t151\t+\t.\tID=query_1;Name=query2.1\n") + f.write("chr1\tquery\tquery2.2\t300\t450\t151\t+\t.\tID=query_2;Name=query2.2\n") + f.write("chr1\tquery\tquery2.3\t480\t800\t321\t+\t.\tID=query_3;Name=query2.3\n") + f.write("chr1\tquery\tquery2.4\t560\t800\t241\t+\t.\tID=query_4;Name=query2.4\n") + f.write("chr1\tquery\tquery2.5\t850\t1000\t151\t+\t.\tID=query_5;Name=query2.5\n") + f.close() + +class MockFindOverlapsWithServeralIntervals_query_case3 (object): + def write(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\tquery3.1\t150\t250\t101\t+\t.\tID=query_1;Name=query3.1\n") + f.write("chr1\tquery\tquery3.2\t380\t400\t21\t+\t.\tID=query_2;Name=query3.2\n") + f.write("chr1\tquery\tquery3.3\t480\t520\t41\t+\t.\tID=query_3;Name=query3.3\n") + f.write("chr1\tquery\tquery3.4\t510\t700\t191\t+\t.\tID=query_4;Name=query3.4\n") + f.write("chr1\tquery\tquery3.5\t900\t950\t41\t+\t.\tID=query_5;Name=query3.5\n") + f.close() + +class MockFindOverlapsWithServeralIntervals_query_case4 (object): + def write(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\tquery4.1\t400\t500\t101\t+\t.\tID=query_1;Name=query4.1\n") + f.write("chr1\tquery\tquery4.2\t450\t600\t151\t+\t.\tID=query_2;Name=query4.2\n") + f.write("chr1\tquery\tquery4.3\t700\t800\t101\t+\t.\tID=query_3;Name=query4.3\n") + f.close() + +class MockFindOverlapsWithServeralIntervals_query_case5 (object): + def write(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\tquery5.1\t850\t950\t101\t+\t.\tID=query_1;Name=query5.1\n") + f.close() + +class MockFindOverlapsWithServeralIntervals_query_case6 (object): + def write(self, fileName): + f = open(fileName, 'w') + 
f.write("chr1\tquery\tquery6.1\t200\t300\t101\t+\t.\tID=query_1;Name=query6.1\n") + f.write("chr1\tquery\tquery6.2\t800\t900\t101\t+\t.\tID=query_2;Name=query6.2\n") + f.close() + +class MockFindOverlapsWithServeralIntervals_query_case7 (object): + def write(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\tquery6.1\t530\t550\t21\t+\t.\tID=query_1;Name=query6.1\n") + f.write("chr1\tquery\tquery6.2\t600\t700\t101\t+\t.\tID=query_2;Name=query6.2\n") + f.write("chr1\tquery\tquery6.3\t650\t900\t251\t+\t.\tID=query_3;Name=query6.3\n") + f.close() + +class MockFindOverlapsWithServeralIntervals_query_case8 (object): + def write(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\tquery7.1\t500\t600\t101\t+\t.\tID=query_1;Name=query7.1\n") + f.write("chr1\tquery\tquery7.2\t700\t800\t101\t+\t.\tID=query_2;Name=query7.2\n") + f.write("chr1\tquery\tquery7.3\t900\t1100\t201\t+\t.\tID=query_3;Name=query7.3\n") + f.write("chr1\tquery\tquery7.4\t1200\t1300\t101\t+\t.\tID=query_4;Name=query7.4\n") + f.close() + +class MockFindOverlapsWithServeralIntervals_query_case9 (object): + def write(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\tquery8.1\t400\t400\t101\t+\t.\tID=query_1;Name=query8.1\n") + f.write("chr1\tquery\tquery8.2\t550\t650\t101\t+\t.\tID=query_2;Name=query8.2\n") + f.close() + +class MockFindOverlapsWithServeralIntervals_query_case10 (object): + def write(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\tquery10.1\t700\t800\t101\t+\t.\tID=query_1;Name=query10.1\n") + f.write("chr1\tquery\tquery10.2\t900\t1000\t101\t+\t.\tID=query_2;Name=query10.2\n") + f.write("chr1\tquery\tquery10.3\t1100\t1300\t201\t+\t.\tID=query_3;Name=query10.3\n") + f.close() + +class MockFindOverlapsWithServeralIntervals_query_case11 (object): + def write(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\tquery11.1\t420\t480\t61\t+\t.\tID=query_1;Name=query11.1\n") + 
f.write("chr1\tquery\tquery11.2\t450\t715\t266\t+\t.\tID=query_2;Name=query11.2\n") + f.close() + + diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/test/MockFindOverlaps_randomExample.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/ncList/test/MockFindOverlaps_randomExample.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,118 @@ +import os +import random +from SMART.Java.Python.getRandomRegions import RandomRegionsGenerator +from commons.core.writer.TranscriptWriter import TranscriptWriter +from SMART.Java.Python.structure.Transcript import Transcript +from commons.core.parsing.GffParser import GffParser + +class MockFindOverlaps_randomExample(object): + + def __init__(self, fileName, ID, numberOfReads, chromSize): + self._fileName = fileName + self._ID = ID + self._numberOfReads = numberOfReads + self._chromSize = chromSize + + def write(self): + iMFO_RE = MockFindOverlaps_randomExample_NonOrder(self._fileName, self._ID, self._numberOfReads, self._chromSize) + iMFO_RE.write() + cmd = 'sort -f -n -k4 -k5.4rn -o %s %s'%(self._fileName, self._fileName) + os.system(cmd) + +class MockFindOverlaps_randomExample_NonOrder(object): + + def __init__(self, fileName, ID, numberOfReads, chromSize): + self._fileName = fileName + self._ID = ID + self._numberOfReads = numberOfReads + self._chromSize = chromSize + + def write(self): + iRRG = RandomRegionsGenerator(2) + iRRG.setMinSize(36) + iRRG.setMaxSize(100) + iRRG.setGenomeSize(self._chromSize) + iRRG.setChromosomeName("chr1") + iRRG.setStrands(False) + iRRG.setNumber(self._numberOfReads) + iRRG.setOutputFile(self._fileName) + iRRG.run() + + +class MockFindOverlaps_randomExample_MOverlaps(object): + + def __init__(self, refFileName, queryFileName, overlapNumber, numberOfReads, chromSize): + self._refFileName = refFileName + self._queryFileName = queryFileName + self._overlapNumber = overlapNumber + self._numberOfReads = numberOfReads + self._chromSize = chromSize 
+ + def createRandomExample(self): + id = 'reference' + iRSS = MockFindOverlaps_randomExample(self._refFileName, id, self._numberOfReads, self._chromSize) + iRSS.write() + self.queryWriter = TranscriptWriter(self._queryFileName , 'gff3') + totalOverlap = 0 + while totalOverlap != self._overlapNumber: + totalOverlap = 0 + i = 0 + while i < 10: + query = self.createRandomTranscript(i, id) + overlapNumber = self.getOverlapNumber(query, self._refFileName, totalOverlap) + while overlapNumber > self._overlapNumber: + query = self.createRandomTranscript(i, id) + overlapNumber = self.getOverlapNumber(query, self._refFileName, totalOverlap) + totalOverlap = overlapNumber + i += 1 + self.queryWriter.addTranscript(query) + self.queryWriter.write() + self.queryWriter.close() +# os.rename("%s.gff3" % (self._queryFileName), self._queryFileName) + + cmd = 'sort -f -n -k4 -k5.4rn -o %s %s'%(self._refFileName, self._refFileName) + os.system(cmd) + cmd = 'sort -f -n -k4 -k5.4rn -o %s %s'%(self._queryFileName, self._queryFileName) + os.system(cmd) + + def createRandomTranscript(self, cpt, id): + iRRG = RandomRegionsGenerator(2) + strand = '+' + chromosome = 'chr1' + size = random.randint(36, 100) + iRRG.setSize(size) + start = random.randint(0, 1000-size) + transcript = iRRG.createTranscript(chromosome, start, size, strand, cpt) + IDdetail = '%s_%d'%(id,cpt) + transcript.setTagValue('ID', IDdetail) + transcript.setName(IDdetail) + return transcript + + def isOverlap(self, query, ref): + if (query.getStart() <= ref.getEnd() and query.getEnd() >= ref.getStart()): + return True + else: + return False + + def getIntervalFromAdress(self, fileName, address): + iParser = GffParser(fileName) + iParser.gotoAddress(int(address)) + iTranscrit = iParser.getNextTranscript() + iParser.close() + return iTranscrit + + def getOverlapNumber(self, query, refFileName, totalOverlap): + count = totalOverlap + fRef = open(refFileName, 'r') + address = fRef.tell() + line = fRef.readline() + while line != 
'': + ref = self.getIntervalFromAdress(refFileName, address) + if self.isOverlap(query, ref): + count += 1 + address = fRef.tell() + line = fRef.readline() + fRef.close() + return count + + diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/test/Test_F_FileSorter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/ncList/test/Test_F_FileSorter.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,84 @@ +import os +import unittest +import struct +from SMART.Java.Python.misc import Utils +from SMART.Java.Python.ncList.FileSorter import FileSorter +from SMART.Java.Python.structure.Transcript import Transcript +from commons.core.writer.Gff3Writer import Gff3Writer +from commons.core.parsing.GffParser import GffParser +from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle + + +class Test_F_FileSorter(unittest.TestCase): + + def setUp(self): + self._inputGff3FileName = 'inputFile.gff3' + self._outputFileName = 'outputFile.pkl' + + def tearDown(self): + return + for fileName in (self._inputGff3FileName, self._sortedFileName, self._expHFileName, self._expLFileName, self._obsHFileName, self._obsLFileName, self._addressFileName): + if os.path.exists(fileName): + os.remove(fileName) + + def test_unique(self): + transcript = self._createTranscript("chr1", 100, 200, "test1.1") + parser = self._writeAndSortAndParse([transcript]) + self.assertEquals(parser.getNbTranscripts(), 1) + for transcript in parser.getIterator(): + self._checkTranscript(transcript, "chr1", 100, 200, "test1.1") + + def test_simple(self): + transcript1 = self._createTranscript("chr1", 300, 400, "test1.1") + transcript2 = self._createTranscript("chr1", 100, 200, "test1.2") + parser = self._writeAndSortAndParse([transcript1, transcript2]) + self.assertEquals(parser.getNbTranscripts(), 2) + for cpt, transcript in enumerate(parser.getIterator()): + if cpt == 0: + self._checkTranscript(transcript, "chr1", 100, 200, "test1.2") + else: + 
self._checkTranscript(transcript, "chr1", 300, 400, "test1.1") + + def test_same_start(self): + transcript1 = self._createTranscript("chr1", 100, 200, "test1.1") + transcript2 = self._createTranscript("chr1", 100, 300, "test1.2") + parser = self._writeAndSortAndParse([transcript1, transcript2]) + self.assertEquals(parser.getNbTranscripts(), 2) + for cpt, transcript in enumerate(parser.getIterator()): + if cpt == 0: + self._checkTranscript(transcript, "chr1", 100, 300, "test1.2") + else: + self._checkTranscript(transcript, "chr1", 100, 200, "test1.1") + + def _writeAndSortAndParse(self, transcripts): + writer = Gff3Writer(self._inputGff3FileName, 0) + for transcript in transcripts: + writer.addTranscript(transcript) + writer.close() + parser = GffParser(self._inputGff3FileName, 0) + fs = FileSorter(parser, 0) + fs.setOutputFileName(self._outputFileName) + fs.sort() + parser = NCListFileUnpickle(self._outputFileName, 0) + return parser + + def _createTranscript(self, chromosome, start, end, name): + transcript = Transcript() + transcript.setChromosome(chromosome) + transcript.setStart(start) + transcript.setEnd(end) + transcript.setDirection("+") + transcript.setName(name) + return transcript + + def _checkTranscript(self, transcript, chromosome, start, end, name): + self.assertEquals(transcript.getChromosome(), chromosome) + self.assertEquals(transcript.getStart(), start) + self.assertEquals(transcript.getEnd(), end) + self.assertEquals(transcript.getDirection(), 1) + self.assertEquals(transcript.getName(), name) + + +if __name__ == "__main__": + unittest.main() + diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/test/Test_F_FindOverlapsWithOneInterval.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/ncList/test/Test_F_FindOverlapsWithOneInterval.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,121 @@ +import unittest +import struct +import os +from SMART.Java.Python.structure.Interval import Interval 
+from SMART.Java.Python.ncList.FindOverlapsWithOneInterval import FindOverlapsWithOneInterval +from SMART.Java.Python.misc import Utils + +class Test_F_FindOverlapsWithOneInterval(unittest.TestCase): + + def setUp(self): + self._inputGff3FileName = 'sortedFile.gff3' + self._writeGFF3File(self._inputGff3FileName) + self._obsFileName = "overlap.gff3" + self._expFileName = "expFile.gff3" + self._iFOWOI = FindOverlapsWithOneInterval(0) + self._iFOWOI.setFileName(self._inputGff3FileName, "gff3") + self._iFOWOI.setOutputFileName(self._obsFileName) + + def tearDown(self): + os.remove(self._inputGff3FileName) + os.remove(self._obsFileName) + os.remove(self._expFileName) + + def test_run_general(self): + self._iFOWOI.setInterval("chr1", 500, 850) + self._iFOWOI.run() + self._writeExpGFF3File_general(self._expFileName) + self.assertTrue(Utils.diff(self._expFileName, self._obsFileName)) + +# def test_run_general_asScript(self): +# cmd = 'python ../FindOverlapsWithOneInterval.py -i %s -f gff3 -o %s -c chr1 -s 500 -e 850 -v 0' % (self._inputGff3FileName, self._obsFileName) +# os.system(cmd) +# self._writeExpGFF3File_general(self._expFileName) +# self.assertTrue(Utils.diff(self._expFileName, self._obsFileName)) +# +# def test_run_one_overlap(self): +# self._iFOWOI.setInterval("chr1", 1250, 1450) +# self._iFOWOI.run() +# self._writeExpGFF3File_one_overlap(self._expFileName) +# self.assertTrue(Utils.diff(self._expFileName, self._obsFileName)) +# +# def test_run_one_overlap_asScript(self): +# cmd = 'python ../FindOverlapsWithOneInterval.py -i %s -f gff3 -o %s -c chr1 -s 1250 -e 1450 -v 0' % (self._inputGff3FileName, self._obsFileName) +# os.system(cmd) +# self._writeExpGFF3File_one_overlap(self._expFileName) +# self.assertTrue(Utils.diff(self._expFileName, self._obsFileName)) +# +# def test_run_all_overlap(self): +# self._iFOWOI.setInterval("chr1", 300, 1250) +# self._iFOWOI.run() +# self._writeExpGff3File_all_overlap(self._expFileName) +# 
self.assertTrue(Utils.diff(self._expFileName, self._obsFileName)) +# +# def test_run_all_overlap_asScript(self): +# cmd = 'python ../FindOverlapsWithOneInterval.py -i %s -f gff3 -o %s -c chr1 -s 300 -e 1250 -v 0' % (self._inputGff3FileName, self._obsFileName) +# os.system(cmd) +# self._writeExpGff3File_all_overlap(self._expFileName) +# self.assertTrue(Utils.diff(self._expFileName, self._obsFileName)) +# +# def test_run_no_overlap_right(self): +# self._iFOWOI.setInterval("chr1", 1400, 1500) +# self._iFOWOI.run() +# f = open(self._expFileName, "w") +# f.close() +# self.assertTrue(Utils.diff(self._expFileName, self._obsFileName)) +# +# def test_run_no_overlap_right_asScript(self): +# cmd = 'python ../FindOverlapsWithOneInterval.py -i %s -f gff3 -o %s -c chr1 -s 1400 -e 1500 -v 0' % (self._inputGff3FileName, self._obsFileName) +# os.system(cmd) +# f = open(self._expFileName, "w") +# f.close() +# self.assertTrue(Utils.diff(self._expFileName, self._obsFileName)) +# +# def test_run_no_overlap_left(self): +# self._iFOWOI.setInterval("chr1", 0, 8) +# self._iFOWOI.run() +# f = open(self._expFileName, "w") +# f.close() +# self.assertTrue(Utils.diff(self._expFileName, self._obsFileName)) +# +# def test_run_no_overlap_left_asScript(self): +# cmd = 'python ../FindOverlapsWithOneInterval.py -i %s -f gff3 -o %s -c chr1 -s 0 -e 8 -v 0' % (self._inputGff3FileName, self._obsFileName) +# os.system(cmd) +# f = open(self._expFileName, "w") +# f.close() +# self.assertTrue(Utils.diff(self._expFileName, self._obsFileName)) + + def _writeExpGff3File_all_overlap(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tS-MART\ttest2.1\t9\t1000\t1001\t+\t.\tID=test2.1;Name=test2.1\n") + f.write("chr1\tS-MART\ttest2.2\t50\t350\t301\t+\t.\tID=test2.2;Name=test2.2\n") + f.write("chr1\tS-MART\ttest2.3\t100\t600\t501\t+\t.\tID=test2.3;Name=test2.3\n") + f.write("chr1\tS-MART\ttest2.4\t200\t450\t251\t+\t.\tID=test2.4;Name=test2.4\n") + 
f.write("chr1\tS-MART\ttest2.5\t700\t950\t251\t+\t.\tID=test2.5;Name=test2.5\n") + f.write("chr1\tS-MART\ttest2.6\t800\t900\t101\t+\t.\tID=test2.6;Name=test2.6\n") + f.write("chr1\tS-MART\ttest2.7\t1200\t1300\t101\t+\t.\tID=test2.7;Name=test2.7\n") + f.close() + + def _writeExpGFF3File_one_overlap(self, fileName): + f = open(fileName, "w") + f.write("chr1\tS-MART\ttest2.7\t1200\t1300\t101\t+\t.\tID=test2.7;Name=test2.7\n") + f.close() + + def _writeExpGFF3File_general(self, fileName): + f = open(fileName, "w") + f.write("chr1\tS-MART\ttranscript\t500\t850\t.\t+\t.\tnbOverlaps=4;overlapsWith=test2.1--test2.3--test2.5--test2.6\n") + f.close() + + def _writeGFF3File(self, fileName): + f = open(fileName, "w") + f.write("chr1\ttest\ttest2.1\t9\t1000\t1001\t+\t.\tID=test2.1;Name=test2.1\n") + f.write("chr1\ttest\ttest2.2\t50\t350\t301\t+\t.\tID=test2.2;Name=test2.2\n") + f.write("chr1\ttest\ttest2.3\t100\t600\t501\t+\t.\tID=test2.3;Name=test2.3\n") + f.write("chr1\ttest\ttest2.4\t200\t450\t251\t+\t.\tID=test2.4;Name=test2.4\n") + f.write("chr1\ttest\ttest2.5\t700\t950\t251\t+\t.\tID=test2.5;Name=test2.5\n") + f.write("chr1\ttest\ttest2.6\t800\t900\t101\t+\t.\tID=test2.6;Name=test2.6\n") + f.write("chr1\ttest\ttest2.7\t1200\t1300\t101\t+\t.\tID=test2.7;Name=test2.7\n") + f.close() + +if __name__ == "__main__": + unittest.main() diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/test/Test_F_FindOverlapsWithSeveralIntervals.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/ncList/test/Test_F_FindOverlapsWithSeveralIntervals.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,173 @@ +import unittest +import os, os.path +from SMART.Java.Python.ncList.FindOverlapsWithSeveralIntervals import FindOverlapsWithSeveralIntervals +from SMART.Java.Python.misc import Utils + +class Test_F_FindOverlapsWithSeveralIntervals(unittest.TestCase): + + def setUp(self): + self._inputRefGff3FileName = 'sorted_Ref.gff3' + 
self._inputQueryGff3FileName = 'sorted_Query.gff3' + self._outputGff3FileName = 'output.gff3' + self._expOutputFileName = 'expOutGff3.gff3' + self._writeQueryGff3File(self._inputQueryGff3FileName) + self._writeGFF3File(self._inputRefGff3FileName) + self._iFOWSI = FindOverlapsWithSeveralIntervals(0) + self._iFOWSI.setRefFileName(self._inputRefGff3FileName, "gff3") + self._iFOWSI.setQueryFileName(self._inputQueryGff3FileName, "gff3") + self._iFOWSI.setOutputFileName(self._outputGff3FileName) + self._iFOWSI.prepareIntermediateFiles() + self._iFOWSI.createNCLists() + + def tearDown(self): + for fileName in (self._inputRefGff3FileName, self._inputQueryGff3FileName, self._outputGff3FileName, self._expOutputFileName): + if os.path.exists(fileName): + os.remove(fileName) + + def test_run_general(self): + self._writeQueryGff3File(self._inputQueryGff3FileName) + self._writeGFF3File(self._inputRefGff3FileName) + self._iFOWSI = FindOverlapsWithSeveralIntervals(0) + self._iFOWSI.setRefFileName(self._inputRefGff3FileName, "gff3") + self._iFOWSI.setQueryFileName(self._inputQueryGff3FileName, "gff3") + self._iFOWSI.setOutputFileName(self._outputGff3FileName) + self._iFOWSI.prepareIntermediateFiles() + self._iFOWSI.createNCLists() + self._iFOWSI.compare() + self._iFOWSI.close() + self._writeExpOutFile_general(self._expOutputFileName) + self.assertTrue(Utils.diff(self._expOutputFileName, self._outputGff3FileName)) + + def test_run_general_asScript(self): + cmd = 'python ../FindOverlapsWithSeveralIntervals.py -i %s -f gff3 -j %s -g gff3 -o %s -v 0' % (self._inputQueryGff3FileName, self._inputRefGff3FileName, self._outputGff3FileName) + os.system(cmd) + self._writeExpOutFile_general(self._expOutputFileName) + self.assertTrue(Utils.diff(self._expOutputFileName, self._outputGff3FileName)) + + + def test_run_overlap_special_case(self): + inputQueryGff3FileName = 'query2.gff3' + self._writeQueryGff3File2(inputQueryGff3FileName) + self._iFOWSI = FindOverlapsWithSeveralIntervals(0) + 
self._iFOWSI.setRefFileName(self._inputRefGff3FileName, "gff3") + self._iFOWSI.setQueryFileName(inputQueryGff3FileName, "gff3") + self._iFOWSI.setOutputFileName(self._outputGff3FileName) + self._iFOWSI.prepareIntermediateFiles() + self._iFOWSI.createNCLists() + self._iFOWSI.compare() + self._iFOWSI.close() + self._writeExpOutFile_special_case(self._expOutputFileName) + self.assertTrue(Utils.diff(self._expOutputFileName, self._outputGff3FileName)) + os.remove(inputQueryGff3FileName) + + def test_run_overlap_special_case_asScript(self): + inputQueryGff3FileName = 'query2.gff3' + self._writeQueryGff3File2(inputQueryGff3FileName) + cmd = 'python ../FindOverlapsWithSeveralIntervals.py -i %s -f gff3 -j %s -g gff3 -o %s -v 0' % (inputQueryGff3FileName, self._inputRefGff3FileName, self._outputGff3FileName) + os.system(cmd) + self._writeExpOutFile_special_case(self._expOutputFileName) + self.assertTrue(Utils.diff(self._expOutputFileName, self._outputGff3FileName)) + os.remove(inputQueryGff3FileName) + + def _writeExpOutFile_special_case(self, fileName): + f = open(fileName, 'w') + f.write("chr1 S-MART test2 1250 1300 781 + . nbOverlaps=1;overlapsWith=test2.7;ID=query_2;Name=test1.2\n") + f.close() + + def _writeExpOutFile_general(self, fileName): + f = open(fileName, 'w') + f.write("""chr1 S-MART test1.1 25 150 126 + . nbOverlaps=3;overlapsWith=test2.1--test2.2--test2.3;ID=query_1;Name=test1.1 +chr1 S-MART test1.2 70 850 781 + . nbOverlaps=6;overlapsWith=test2.1--test2.2--test2.3--test2.4--test2.5--test2.6;ID=query_2;Name=test1.2 +chr1 S-MART test1.3 550 850 201 + . nbOverlaps=4;overlapsWith=test2.1--test2.3--test2.5--test2.6;ID=query_3;Name=test1.3 +chr1 S-MART test1.4 925 1025 101 + . nbOverlaps=2;overlapsWith=test2.1--test2.5;ID=query_4;Name=test1.4 +chr1 S-MART test1.5 1201 1210 10 + . 
nbOverlaps=1;overlapsWith=test2.7;ID=query_5;Name=test1.5 +""") + f.close() + + def _writeExpOutFile_cas_1(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tS-MART\ttest2.1\t9\t1000\t1001\t+\t.\tName=test2.1;OverlapWith=query_3;score=1001;feature=test2.1;ID=test2.1\n") + f.write("chr1\tS-MART\ttest2.3\t100\t600\t501\t+\t.\tName=test2.3;OverlapWith=query_3;score=501;feature=test2.3;ID=test2.3\n") + f.write("chr1\tS-MART\ttest2.5\t700\t950\t251\t+\t.\tName=test2.5;OverlapWith=query_3;score=251;feature=test2.5;ID=test2.5\n") + f.write("chr1\tS-MART\ttest2.6\t800\t900\t101\t+\t.\tName=test2.6;OverlapWith=query_3;score=101;feature=test2.6;ID=test2.6\n") + f.close() + + def _writeExpOutFile_cas_2(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tS-MART\ttest2.1\t9\t1000\t1001\t+\t.\tName=test2.1;OverlapWith=query_2;score=1001;feature=test2.1;ID=test2.1\n") + f.write("chr1\tS-MART\ttest2.2\t50\t350\t301\t+\t.\tName=test2.2;OverlapWith=query_2;score=301;feature=test2.2;ID=test2.2\n") + f.write("chr1\tS-MART\ttest2.3\t100\t600\t501\t+\t.\tName=test2.3;OverlapWith=query_2;score=501;feature=test2.3;ID=test2.3\n") + f.write("chr1\tS-MART\ttest2.4\t200\t450\t251\t+\t.\tName=test2.4;OverlapWith=query_2;score=251;feature=test2.4;ID=test2.4\n") + f.write("chr1\tS-MART\ttest2.5\t700\t950\t251\t+\t.\tName=test2.5;OverlapWith=query_2;score=251;feature=test2.5;ID=test2.5\n") + f.write("chr1\tS-MART\ttest2.6\t800\t900\t101\t+\t.\tName=test2.6;OverlapWith=query_2;score=101;feature=test2.6;ID=test2.6\n") + f.write("chr1\tS-MART\ttest2.1\t9\t1000\t1001\t+\t.\tName=test2.1;OverlapWith=query_3;score=1001;feature=test2.1;ID=test2.1\n") + f.write("chr1\tS-MART\ttest2.3\t100\t600\t501\t+\t.\tName=test2.3;OverlapWith=query_3;score=501;feature=test2.3;ID=test2.3\n") + f.write("chr1\tS-MART\ttest2.5\t700\t950\t251\t+\t.\tName=test2.5;OverlapWith=query_3;score=251;feature=test2.5;ID=test2.5\n") + 
f.write("chr1\tS-MART\ttest2.6\t800\t900\t101\t+\t.\tName=test2.6;OverlapWith=query_3;score=101;feature=test2.6;ID=test2.6\n") + f.close() + + def _writeExpOutFile_all_overlap(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tS-MART\ttest2.1\t9\t1000\t1001\t+\t.\tName=test2.1;OverlapWith=query_2;score=1001;feature=test2.1;ID=test2.1\n") + f.write("chr1\tS-MART\ttest2.2\t50\t350\t301\t+\t.\tName=test2.2;OverlapWith=query_2;score=301;feature=test2.2;ID=test2.2\n") + f.write("chr1\tS-MART\ttest2.3\t100\t600\t501\t+\t.\tName=test2.3;OverlapWith=query_2;score=501;feature=test2.3;ID=test2.3\n") + f.write("chr1\tS-MART\ttest2.4\t200\t450\t251\t+\t.\tName=test2.4;OverlapWith=query_2;score=251;feature=test2.4;ID=test2.4\n") + f.write("chr1\tS-MART\ttest2.5\t700\t950\t251\t+\t.\tName=test2.5;OverlapWith=query_2;score=251;feature=test2.5;ID=test2.5\n") + f.write("chr1\tS-MART\ttest2.6\t800\t900\t101\t+\t.\tName=test2.6;OverlapWith=query_2;score=101;feature=test2.6;ID=test2.6\n") + f.close() + + def _writeExpOutFile_overlap_to_children(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tS-MART\ttest2.1\t9\t1000\t1001\t+\t.\tName=test2.1;OverlapWith=query_3;score=1001;feature=test2.1;ID=test2.1\n") + f.write("chr1\tS-MART\ttest2.3\t100\t600\t501\t+\t.\tName=test2.3;OverlapWith=query_3;score=501;feature=test2.3;ID=test2.3\n") + f.write("chr1\tS-MART\ttest2.5\t700\t950\t251\t+\t.\tName=test2.5;OverlapWith=query_3;score=251;feature=test2.5;ID=test2.5\n") + f.write("chr1\tS-MART\ttest2.6\t800\t900\t101\t+\t.\tName=test2.6;OverlapWith=query_3;score=101;feature=test2.6;ID=test2.6\n") + f.close() + + def _writeExpOutFile_not_overlap_to_children(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tS-MART\ttest2.1\t9\t1000\t1001\t+\t.\tName=test2.1;OverlapWith=query_1;score=1001;feature=test2.1;ID=test2.1\n") + f.write("chr1\tS-MART\ttest2.2\t50\t350\t301\t+\t.\tName=test2.2;OverlapWith=query_1;score=301;feature=test2.2;ID=test2.2\n") + 
f.write("chr1\tS-MART\ttest2.3\t100\t600\t501\t+\t.\tName=test2.3;OverlapWith=query_1;score=501;feature=test2.3;ID=test2.3\n") + f.write("chr1\tS-MART\ttest2.1\t9\t1000\t1001\t+\t.\tName=test2.1;OverlapWith=query_4;score=1001;feature=test2.1;ID=test2.1\n") + f.write("chr1\tS-MART\ttest2.5\t700\t950\t251\t+\t.\tName=test2.5;OverlapWith=query_4;score=251;feature=test2.5;ID=test2.5\n") + f.close() + + def _writeExpOutFile_no_overlap_right(self, fileName): + f = open(fileName, 'w') + f.close() + + def _writeExpOutFile_one_overlap(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tS-MART\ttest2.7\t1200\t1300\t101\t+\t.\tName=test2.7;OverlapWith=query_5;score=101;feature=test2.7;ID=test2.7\n") + f.close() + + def _writeQueryGff3File2(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\ttest1\t1100\t1150\t126\t+\t.\tID=query_1;Name=test1.1\n") + f.write("chr1\tquery\ttest2\t1250\t1300\t781\t+\t.\tID=query_2;Name=test1.2\n") + f.close() + + def _writeQueryGff3File(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\ttest1.1\t25\t150\t126\t+\t.\tID=query_1;Name=test1.1\n") + f.write("chr1\tquery\ttest1.2\t70\t850\t781\t+\t.\tID=query_2;Name=test1.2\n") + f.write("chr1\tquery\ttest1.3\t550\t850\t201\t+\t.\tID=query_3;Name=test1.3\n") + f.write("chr1\tquery\ttest1.4\t925\t1025\t101\t+\t.\tID=query_4;Name=test1.4\n") + f.write("chr1\tquery\ttest1.5\t1201\t1210\t10\t+\t.\tID=query_5;Name=test1.5\n") + f.write("chr1\tquery\ttest1.6\t1500\t1600\t101\t+\t.\tID=query_6;Name=test1.6\n") + f.close() + + def _writeGFF3File(self, fileName): + f = open(fileName, "w") + f.write("chr1\ttest\ttest2.1\t9\t1000\t1001\t+\t.\tID=test2.1;Name=test2.1\n") + f.write("chr1\ttest\ttest2.2\t50\t350\t301\t+\t.\tID=test2.2;Name=test2.2\n") + f.write("chr1\ttest\ttest2.3\t100\t600\t501\t+\t.\tID=test2.3;Name=test2.3\n") + f.write("chr1\ttest\ttest2.4\t200\t450\t251\t+\t.\tID=test2.4;Name=test2.4\n") + 
f.write("chr1\ttest\ttest2.5\t700\t950\t251\t+\t.\tID=test2.5;Name=test2.5\n") + f.write("chr1\ttest\ttest2.6\t800\t900\t101\t+\t.\tID=test2.6;Name=test2.6\n") + f.write("chr1\ttest\ttest2.7\t1200\t1300\t101\t+\t.\tID=test2.7;Name=test2.7\n") + f.close() + +if __name__ == "__main__": + unittest.main() diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/test/Test_F_FindOverlaps_naif.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/ncList/test/Test_F_FindOverlaps_naif.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,455 @@ +import unittest +import os +from commons.core.utils.FileUtils import FileUtils +from SMART.Java.Python.misc import Utils +from SMART.Java.Python.ncList.FindOverlaps_naif import FindOverlaps_naif +from SMART.Java.Python.ncList.test.MockFindOverlapsWithSeveralIntervals import * + +class Test_F_FindOverlaps_naif(unittest.TestCase): + + def setUp(self): + self._inputRefGff3FileName = 'ref.gff3' + self._writeGFF3File(self._inputRefGff3FileName) + self._inputQueryGff3FileName = 'query.gff3' + self._writeQueryGff3File(self._inputQueryGff3FileName) + self._outputGff3FileName = 'output.gff3' + self._expOutputFileName = 'expOutGff3.gff3' + self._iFON = FindOverlaps_naif(self._inputRefGff3FileName, self._inputQueryGff3FileName) + self._iFON.setOutputGff3FileName(self._outputGff3FileName) + + def tearDown(self): + os.remove(self._inputRefGff3FileName) + os.remove(self._inputQueryGff3FileName) + os.remove(self._outputGff3FileName) + os.remove(self._expOutputFileName) + + def test_run_general(self): + self._iFON.run() + self._iFON.close() + self._writeExpOutFile_general(self._expOutputFileName) + self.assertTrue(Utils.diff(self._expOutputFileName, self._outputGff3FileName)) + + def test_run_general_asScript(self): + cmd = 'python ../FindOverlaps_naif.py -i %s -j %s -o %s' % (self._inputRefGff3FileName, self._inputQueryGff3FileName, self._outputGff3FileName) + os.system(cmd) + 
self._writeExpOutFile_general(self._expOutputFileName) + self.assertTrue(Utils.diff(self._expOutputFileName, self._outputGff3FileName)) + + def test_run_overlap_special_case(self): + inputQueryGff3FileName = 'query2.gff3' + self._writeQueryGff3File2(inputQueryGff3FileName) + iFON = FindOverlaps_naif(self._inputRefGff3FileName, inputQueryGff3FileName) + iFON.setOutputGff3FileName(self._outputGff3FileName) + iFON.run() + iFON.close() + self._writeExpOutFile_special_case(self._expOutputFileName) + self.assertTrue(Utils.diff(self._expOutputFileName, self._outputGff3FileName)) + os.remove(inputQueryGff3FileName) + + def test_run_overlap_special_case_asScript(self): + inputQueryGff3FileName = 'query2.gff3' + self._writeQueryGff3File2(inputQueryGff3FileName) + cmd = 'python ../FindOverlaps_naif.py -i %s -j %s -o %s' % (self._inputRefGff3FileName, inputQueryGff3FileName, self._outputGff3FileName) + os.system(cmd) + self._writeExpOutFile_special_case(self._expOutputFileName) + self.assertTrue(Utils.diff(self._expOutputFileName, self._outputGff3FileName)) + os.remove(inputQueryGff3FileName) + + def test_case_2(self): + inputRefGff3FileName = 'ref_case2.gff3' + iMock = MockFindOverlapsWithServeralIntervals_case2() + iMock.write(inputRefGff3FileName) + inputQueryGff3FileName = 'query_case2.gff3' + self._writeQueryGff3File_case2(inputQueryGff3FileName) + iFON = FindOverlaps_naif(inputRefGff3FileName, inputQueryGff3FileName) + iFON.setOutputGff3FileName(self._outputGff3FileName) + iFON.run() + iFON.close() + self._writeExpOutFile_case2(self._expOutputFileName) + self.assertTrue(Utils.diff(self._expOutputFileName, self._outputGff3FileName)) + os.remove(inputQueryGff3FileName) + os.remove(inputRefGff3FileName) + + def test_case_3(self): + inputRefGff3FileName = 'ref_case3.gff3' + iMock = MockFindOverlapsWithServeralIntervals_case3() + iMock.write(inputRefGff3FileName) + inputQueryGff3FileName = 'query_case3.gff3' + self._writeQueryGff3File_case3(inputQueryGff3FileName) + iFON = 
FindOverlaps_naif(inputRefGff3FileName, inputQueryGff3FileName) + iFON.setOutputGff3FileName(self._outputGff3FileName) + iFON.run() + iFON.close() + self._writeExpOutFile_case3(self._expOutputFileName) + self.assertTrue(Utils.diff(self._expOutputFileName, self._outputGff3FileName)) + os.remove(inputQueryGff3FileName) + os.remove(inputRefGff3FileName) + + def test_case_4(self): + inputRefGff3FileName = 'ref_case4.gff3' + iMock = MockFindOverlapsWithServeralIntervals_case4_5() + iMock.write(inputRefGff3FileName) + inputQueryGff3FileName = 'query_case4.gff3' + self._writeQueryGff3File_case4(inputQueryGff3FileName) + iFON = FindOverlaps_naif(inputRefGff3FileName, inputQueryGff3FileName) + iFON.setOutputGff3FileName(self._outputGff3FileName) + iFON.run() + iFON.close() + self._writeExpOutFile_case4(self._expOutputFileName) + self.assertTrue(Utils.diff(self._expOutputFileName, self._outputGff3FileName)) + os.remove(inputQueryGff3FileName) + os.remove(inputRefGff3FileName) + + def test_case_5(self): + inputRefGff3FileName = 'ref_case5.gff3' + iMock = MockFindOverlapsWithServeralIntervals_case4_5() + iMock.write(inputRefGff3FileName) + inputQueryGff3FileName = 'query_case5.gff3' + self._writeQueryGff3File_case5(inputQueryGff3FileName) + iFON = FindOverlaps_naif(inputRefGff3FileName, inputQueryGff3FileName) + iFON.setOutputGff3FileName(self._outputGff3FileName) + iFON.run() + iFON.close() + self._writeExpOutFile_case5(self._expOutputFileName) + self.assertTrue(Utils.diff(self._expOutputFileName, self._outputGff3FileName)) + os.remove(inputQueryGff3FileName) + os.remove(inputRefGff3FileName) + + def test_case_6(self): + inputRefGff3FileName = 'ref_case6.gff3' + iMock = MockFindOverlapsWithServeralIntervals_case6_7() + iMock.write(inputRefGff3FileName) + inputQueryGff3FileName = 'query_case6.gff3' + self._writeQueryGff3File_case6(inputQueryGff3FileName) + iFON = FindOverlaps_naif(inputRefGff3FileName, inputQueryGff3FileName) + 
iFON.setOutputGff3FileName(self._outputGff3FileName) + iFON.run() + iFON.close() + self._writeExpOutFile_case6(self._expOutputFileName) + self.assertTrue(Utils.diff(self._expOutputFileName, self._outputGff3FileName)) + os.remove(inputQueryGff3FileName) + os.remove(inputRefGff3FileName) + + def test_case_7(self): + inputRefGff3FileName = 'ref_case7.gff3' + iMock = MockFindOverlapsWithServeralIntervals_case6_7() + iMock.write(inputRefGff3FileName) + inputQueryGff3FileName = 'query_case7.gff3' + self._writeQueryGff3File_case7(inputQueryGff3FileName) + iFON = FindOverlaps_naif(inputRefGff3FileName, inputQueryGff3FileName) + iFON.setOutputGff3FileName(self._outputGff3FileName) + iFON.run() + iFON.close() + self._writeExpOutFile_case7(self._expOutputFileName) + self.assertTrue(Utils.diff(self._expOutputFileName, self._outputGff3FileName)) + os.remove(inputQueryGff3FileName) + os.remove(inputRefGff3FileName) + + def test_case_8(self): + inputRefGff3FileName = 'ref_case8.gff3' + iMock = MockFindOverlapsWithServeralIntervals_case8() + iMock.write(inputRefGff3FileName) + inputQueryGff3FileName = 'query_case8.gff3' + self._writeQueryGff3File_case8(inputQueryGff3FileName) + iFON = FindOverlaps_naif(inputRefGff3FileName, inputQueryGff3FileName) + iFON.setOutputGff3FileName(self._outputGff3FileName) + iFON.run() + iFON.close() + self._writeExpOutFile_case8(self._expOutputFileName) + self.assertTrue(Utils.diff(self._expOutputFileName, self._outputGff3FileName)) + os.remove(inputQueryGff3FileName) + os.remove(inputRefGff3FileName) + + def test_case_9(self): + inputRefGff3FileName = 'ref_case9.gff3' + iMock = MockFindOverlapsWithServeralIntervals_case9() + iMock.write(inputRefGff3FileName) + inputQueryGff3FileName = 'query_case9.gff3' + self._writeQueryGff3File_case9(inputQueryGff3FileName) + iFON = FindOverlaps_naif(inputRefGff3FileName, inputQueryGff3FileName) + iFON.setOutputGff3FileName(self._outputGff3FileName) + iFON.run() + iFON.close() + 
self._writeExpOutFile_case9(self._expOutputFileName) + self.assertTrue(Utils.diff(self._expOutputFileName, self._outputGff3FileName)) + os.remove(inputQueryGff3FileName) + os.remove(inputRefGff3FileName) + + def test_case_10(self): + inputRefGff3FileName = 'ref_case10.gff3' + iMock = MockFindOverlapsWithServeralIntervals_case10() + iMock.write(inputRefGff3FileName) + inputQueryGff3FileName = 'query_case10.gff3' + self._writeQueryGff3File_case10(inputQueryGff3FileName) + iFON = FindOverlaps_naif(inputRefGff3FileName, inputQueryGff3FileName) + iFON.setOutputGff3FileName(self._outputGff3FileName) + iFON.run() + iFON.close() + self._writeExpOutFile_case10(self._expOutputFileName) + self.assertTrue(Utils.diff(self._expOutputFileName, self._outputGff3FileName)) + os.remove(inputQueryGff3FileName) + os.remove(inputRefGff3FileName) + + def test_case_11(self): + inputRefGff3FileName = 'ref_case11.gff3' + iMock = MockFindOverlapsWithServeralIntervals_case11() + iMock.write(inputRefGff3FileName) + inputQueryGff3FileName = 'query_case11.gff3' + self._writeQueryGff3File_case11(inputQueryGff3FileName) + iFON = FindOverlaps_naif(inputRefGff3FileName, inputQueryGff3FileName) + iFON.setOutputGff3FileName(self._outputGff3FileName) + iFON.run() + iFON.close() + self._writeExpOutFile_case11(self._expOutputFileName) + self.assertTrue(Utils.diff(self._expOutputFileName, self._outputGff3FileName)) + os.remove(inputQueryGff3FileName) + os.remove(inputRefGff3FileName) + + def _writeExpOutFile_special_case(self, fileName): + f = open(fileName, 'w') + f.write("chr1 S-MART test2 1250 1300 781 + . nbOverlaps=1;overlapsWith=test2.7;ID=query_2;Name=test1.2\n") + f.close() + + def _writeExpOutFile_general(self, fileName): + f = open(fileName, 'w') + f.write("""chr1 S-MART test1.1 25 150 126 + . nbOverlaps=3;overlapsWith=test2.1--test2.2--test2.3;ID=query_1;Name=test1.1 +chr1 S-MART test1.2 70 850 781 + . 
nbOverlaps=6;overlapsWith=test2.1--test2.2--test2.3--test2.4--test2.5--test2.6;ID=query_2;Name=test1.2 +chr1 S-MART test1.3 550 850 201 + . nbOverlaps=4;overlapsWith=test2.1--test2.3--test2.5--test2.6;ID=query_3;Name=test1.3 +chr1 S-MART test1.4 925 1025 101 + . nbOverlaps=2;overlapsWith=test2.1--test2.5;ID=query_4;Name=test1.4 +chr1 S-MART test1.5 1201 1210 10 + . nbOverlaps=1;overlapsWith=test2.7;ID=query_5;Name=test1.5 +""") + f.close() + + def _writeExpOutFile_cas_1(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tS-MART\ttest2.1\t9\t1000\t1001\t+\t.\tOverlapWith=query_3;ID=test2.1;Name=test2.1\n") + f.write("chr1\tS-MART\ttest2.3\t100\t600\t501\t+\t.\tOverlapWith=query_3;ID=test2.3;Name=test2.3\n") + f.write("chr1\tS-MART\ttest2.5\t700\t950\t251\t+\t.\tOverlapWith=query_3;ID=test2.5;Name=test2.5\n") + f.write("chr1\tS-MART\ttest2.6\t800\t900\t101\t+\t.\tOverlapWith=query_3;ID=test2.6;Name=test2.6\n") + f.close() + + def _writeExpOutFile_cas_2(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tS-MART\ttest2.1\t9\t1000\t1001\t+\t.\tOverlapWith=query_2;Name=test2.1\n") + f.write("chr1\tS-MART\ttest2.2\t50\t350\t301\t+\t.\tOverlapWith=query_2;Name=test2.2\n") + f.write("chr1\tS-MART\ttest2.3\t100\t600\t501\t+\t.\tOverlapWith=query_2;Name=test2.3\n") + f.write("chr1\tS-MART\ttest2.4\t200\t450\t251\t+\t.\tOverlapWith=query_2;Name=test2.4\n") + f.write("chr1\tS-MART\ttest2.5\t700\t950\t251\t+\t.\tOverlapWith=query_2;Name=test2.5\n") + f.write("chr1\tS-MART\ttest2.6\t800\t900\t101\t+\t.\tOverlapWith=query_2;Name=test2.6\n") + f.write("chr1\tS-MART\ttest2.1\t9\t1000\t1001\t+\t.\tOverlapWith=query_3;Name=test2.1\n") + f.write("chr1\tS-MART\ttest2.3\t100\t600\t501\t+\t.\tOverlapWith=query_3;Name=test2.3\n") + f.write("chr1\tS-MART\ttest2.5\t700\t950\t251\t+\t.\tOverlapWith=query_3;Name=test2.5\n") + f.write("chr1\tS-MART\ttest2.6\t800\t900\t101\t+\t.\tOverlapWith=query_3;Name=test2.6\n") + f.close() + + def _writeExpOutFile_all_overlap(self, 
fileName): + f = open(fileName, 'w') + f.write("chr1\tS-MART\ttest2.1\t9\t1000\t1001\t+\t.\tOverlapWith=query_2;ID=test2.1;Name=test2.1\n") + f.write("chr1\tS-MART\ttest2.2\t50\t350\t301\t+\t.\tOverlapWith=query_2;D=test2.2;Name=test2.2\n") + f.write("chr1\tS-MART\ttest2.3\t100\t600\t501\t+\t.\tOverlapWith=query_2;ID=test2.3;Name=test2.3\n") + f.write("chr1\tS-MART\ttest2.4\t200\t450\t251\t+\t.\tOverlapWith=query_2;ID=test2.4;Name=test2.4\n") + f.write("chr1\tS-MART\ttest2.5\t700\t950\t251\t+\t.\tOverlapWith=query_2;ID=test2.5;Name=test2.5\n") + f.write("chr1\tS-MART\ttest2.6\t800\t900\t101\t+\t.\tOverlapWith=query_2;ID=test2.6;Name=test2.6\n") + f.close() + + def _writeExpOutFile_overlap_to_children(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tS-MART\ttest2.1\t9\t1000\t1001\t+\t.\tOverlapWith=query_3;ID=test2.1;Name=test2.1\n") + f.write("chr1\tS-MART\ttest2.3\t100\t600\t501\t+\t.\tOverlapWith=query_3;ID=test2.3;Name=test2.3\n") + f.write("chr1\tS-MART\ttest2.5\t700\t950\t251\t+\t.\tOverlapWith=query_3;ID=test2.5;Name=test2.5\n") + f.write("chr1\tS-MART\ttest2.6\t800\t900\t101\t+\t.\tOverlapWith=query_3;ID=test2.6;Name=test2.6\n") + f.close() + + def _writeExpOutFile_not_overlap_to_children(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tS-MART\ttest2.1\t9\t1000\t1001\t+\t.\tOverlapWith=query_1;ID=test2.1;Name=test2.1\n") + f.write("chr1\tS-MART\ttest2.2\t50\t350\t301\t+\t.\tOverlapWith=query_1;ID=test2.2;Name=test2.2\n") + f.write("chr1\tS-MART\ttest2.3\t100\t600\t501\t+\t.\tOverlapWith=query_1;ID=test2.3;Name=test2.3\n") + f.write("chr1\tS-MART\ttest2.1\t9\t1000\t1001\t+\t.\tOverlapWith=query_4;ID=test2.1;Name=test2.1\n") + f.write("chr1\tS-MART\ttest2.5\t700\t950\t251\t+\t.\tOverlapWith=query_4;ID=test2.5;Name=test2.5\n") + f.close() + + def _writeExpOutFile_no_overlap_right(self, fileName): + f = open(fileName, 'w') + f.close() + + def _writeExpOutFile_one_overlap(self, fileName): + f = open(fileName, 'w') + 
f.write("chr1\tS-MART\ttest2.7\t1200\t1300\t101\t+\t.\tOverlapWith=query_5;ID=test2.7;Name=test2.7\n") + f.close() + + def _writeExpOutFile_case2(self, fileName): + f = open(fileName, 'w') + f.write("""chr1 S-MART test2.1 150 300 151 + . nbOverlaps=4;overlapsWith=test2.1--test2.2--test2.3--test2.4;ID=query_1;Name=test2.1 +chr1 S-MART test2.2 300 450 781 + . nbOverlaps=3;overlapsWith=test2.1--test2.2--test2.3;ID=query_2;Name=test2.2 +chr1 S-MART test2.3 480 800 321 + . nbOverlaps=1;overlapsWith=test2.1;ID=query_3;Name=test2.3 +chr1 S-MART test2.5 850 1000 151 + . nbOverlaps=1;overlapsWith=test2.5;ID=query_5;Name=test2.5 +""") + f.close() + + def _writeExpOutFile_case3(self, fileName): + f = open(fileName, 'w') + f.write("""chr1 S-MART test3.1 150 250 101 + . nbOverlaps=4;overlapsWith=test3.1--test3.2--test3.3--test3.4;ID=query_1;Name=test3.1 +chr1 S-MART test3.2 380 400 21 + . nbOverlaps=4;overlapsWith=test3.1--test3.2--test3.3--test3.5;ID=query_2;Name=test3.2 +chr1 S-MART test3.3 480 520 41 + . nbOverlaps=1;overlapsWith=test3.1;ID=query_3;Name=test3.3 +chr1 S-MART test3.5 900 950 51 + . nbOverlaps=1;overlapsWith=test3.6;ID=query_5;Name=test3.5 +""") + f.close() + + def _writeExpOutFile_case4(self, fileName): + f = open(fileName, 'w') + f.write("""chr1 S-MART test4.1 400 500 101 + . nbOverlaps=3;overlapsWith=test4.1--test4.2--test4.3;ID=query_1;Name=test4.1 +chr1 S-MART test4.2 450 600 151 + . nbOverlaps=3;overlapsWith=test4.1--test4.2--test4.3;ID=query_2;Name=test4.2 +chr1 S-MART test4.3 700 800 101 + . nbOverlaps=2;overlapsWith=test4.1--test4.2;ID=query_3;Name=test4.3 +""") + f.close() + + def _writeExpOutFile_case5(self, fileName): + f = open(fileName, 'w') + f.write("chr1 S-MART test5.1 850 950 101 + . nbOverlaps=1;overlapsWith=test4.1;ID=query_1;Name=test5.1\n") + f.close() + + def _writeExpOutFile_case6(self, fileName): + f = open(fileName, 'w') + f.write("""chr1 S-MART test6.1 200 300 101 + . 
nbOverlaps=2;overlapsWith=test6.1--test6.2;ID=query_1;Name=test6.1 +chr1 S-MART test6.2 800 900 101 + . nbOverlaps=2;overlapsWith=test6.1--test6.5;ID=query_2;Name=test6.2 +""") + f.close() + + def _writeExpOutFile_case7(self, fileName): + f = open(fileName, 'w') + f.write("""chr1 S-MART test7.1 530 550 21 + . nbOverlaps=1;overlapsWith=test6.1;ID=query_1;Name=test7.1 +chr1 S-MART test7.2 600 700 101 + . nbOverlaps=1;overlapsWith=test6.1;ID=query_2;Name=test7.2 +chr1 S-MART test7.3 650 900 251 + . nbOverlaps=2;overlapsWith=test6.1--test6.5;ID=query_3;Name=test7.3 +""") + f.close() + + def _writeExpOutFile_case8(self, fileName): + f = open(fileName, 'w') + f.write("""chr1 S-MART test8.1 500 600 101 + . nbOverlaps=1;overlapsWith=test8.1;ID=query_1;Name=test8.1 +chr1 S-MART test8.2 700 800 101 + . nbOverlaps=1;overlapsWith=test8.1;ID=query_2;Name=test8.2 +chr1 S-MART test8.3 900 1100 201 + . nbOverlaps=1;overlapsWith=test8.1;ID=query_3;Name=test8.3 +""") + f.close() + + def _writeExpOutFile_case9(self, fileName): + f = open(fileName, 'w') + f.write("""chr1 S-MART test9.1 400 500 101 + . nbOverlaps=1;overlapsWith=test9.1;ID=query_1;Name=test9.1 +chr1 S-MART test9.2 550 650 101 + . nbOverlaps=2;overlapsWith=test9.1--test9.2;ID=query_2;Name=test9.2 +""") + f.close() + + def _writeExpOutFile_case10(self, fileName): + f = open(fileName, 'w') + f.write("""chr1 S-MART test10.1 700 800 101 + . nbOverlaps=1;overlapsWith=test10.1;ID=query_1;Name=test10.1 +chr1 S-MART test10.2 900 1000 101 + . nbOverlaps=1;overlapsWith=test10.1;ID=query_2;Name=test10.2 +chr1 S-MART test10.3 1100 1300 201 + . nbOverlaps=1;overlapsWith=test10.5;ID=query_3;Name=test10.3 +""") + f.close() + + def _writeExpOutFile_case11(self, fileName): + f = open(fileName, 'w') + f.write("""chr1 S-MART test11.1 420 480 61 + . nbOverlaps=1;overlapsWith=test11.1;ID=query_1;Name=test11.1 +chr1 S-MART test11.2 450 715 266 + . 
nbOverlaps=3;overlapsWith=test11.1--test11.4--test11.5;ID=query_2;Name=test11.2 +""") + f.close() + + def _writeQueryGff3File2(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\ttest1\t1100\t1150\t126\t+\t.\tID=query_1;Name=test1.1\n") + f.write("chr1\tquery\ttest2\t1250\t1300\t781\t+\t.\tID=query_2;Name=test1.2\n") + f.close() + + def _writeQueryGff3File(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\ttest1.1\t25\t150\t126\t+\t.\tID=query_1;Name=test1.1\n") + f.write("chr1\tquery\ttest1.2\t70\t850\t781\t+\t.\tID=query_2;Name=test1.2\n") + f.write("chr1\tquery\ttest1.3\t550\t850\t201\t+\t.\tID=query_3;Name=test1.3\n") + f.write("chr1\tquery\ttest1.4\t925\t1025\t101\t+\t.\tID=query_4;Name=test1.4\n") + f.write("chr1\tquery\ttest1.5\t1201\t1210\t10\t+\t.\tID=query_5;Name=test1.5\n") + f.write("chr1\tquery\ttest1.6\t1500\t1600\t101\t+\t.\tID=query_6;Name=test1.6\n") + f.close() + + def _writeQueryGff3File_case2(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\ttest2.1\t150\t300\t151\t+\t.\tID=query_1;Name=test2.1\n") + f.write("chr1\tquery\ttest2.2\t300\t450\t781\t+\t.\tID=query_2;Name=test2.2\n") + f.write("chr1\tquery\ttest2.3\t480\t800\t321\t+\t.\tID=query_3;Name=test2.3\n") + f.write("chr1\tquery\ttest2.4\t560\t800\t241\t+\t.\tID=query_4;Name=test2.4\n") + f.write("chr1\tquery\ttest2.5\t850\t1000\t151\t+\t.\tID=query_5;Name=test2.5\n") + f.close() + + def _writeQueryGff3File_case3(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\ttest3.1\t150\t250\t101\t+\t.\tID=query_1;Name=test3.1\n") + f.write("chr1\tquery\ttest3.2\t380\t400\t21\t+\t.\tID=query_2;Name=test3.2\n") + f.write("chr1\tquery\ttest3.3\t480\t520\t41\t+\t.\tID=query_3;Name=test3.3\n") + f.write("chr1\tquery\ttest3.4\t510\t700\t191\t+\t.\tID=query_4;Name=test3.4\n") + f.write("chr1\tquery\ttest3.5\t900\t950\t51\t+\t.\tID=query_5;Name=test3.5\n") + f.close() + + def _writeQueryGff3File_case4(self, fileName): + f = open(fileName, 'w') + 
f.write("chr1\tquery\ttest4.1\t400\t500\t101\t+\t.\tID=query_1;Name=test4.1\n") + f.write("chr1\tquery\ttest4.2\t450\t600\t151\t+\t.\tID=query_2;Name=test4.2\n") + f.write("chr1\tquery\ttest4.3\t700\t800\t101\t+\t.\tID=query_3;Name=test4.3\n") + f.close() + + def _writeQueryGff3File_case5(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\ttest5.1\t850\t950\t101\t+\t.\tID=query_1;Name=test5.1\n") + f.close() + + def _writeQueryGff3File_case6(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\ttest6.1\t200\t300\t101\t+\t.\tID=query_1;Name=test6.1\n") + f.write("chr1\tquery\ttest6.2\t800\t900\t101\t+\t.\tID=query_2;Name=test6.2\n") + f.close() + + def _writeQueryGff3File_case7(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\ttest7.1\t530\t550\t21\t+\t.\tID=query_1;Name=test7.1\n") + f.write("chr1\tquery\ttest7.2\t600\t700\t101\t+\t.\tID=query_2;Name=test7.2\n") + f.write("chr1\tquery\ttest7.3\t650\t900\t251\t+\t.\tID=query_3;Name=test7.3\n") + f.close() + + def _writeQueryGff3File_case8(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\ttest8.1\t500\t600\t101\t+\t.\tID=query_1;Name=test8.1\n") + f.write("chr1\tquery\ttest8.2\t700\t800\t101\t+\t.\tID=query_2;Name=test8.2\n") + f.write("chr1\tquery\ttest8.3\t900\t1100\t201\t+\t.\tID=query_3;Name=test8.3\n") + f.write("chr1\tquery\ttest8.4\t1200\t1300\t101\t+\t.\tID=query_4;Name=test8.4\n") + f.close() + + def _writeQueryGff3File_case9(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\ttest9.1\t400\t500\t101\t+\t.\tID=query_1;Name=test9.1\n") + f.write("chr1\tquery\ttest9.2\t550\t650\t101\t+\t.\tID=query_2;Name=test9.2\n") + f.close() + + def _writeQueryGff3File_case10(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\ttest10.1\t700\t800\t101\t+\t.\tID=query_1;Name=test10.1\n") + f.write("chr1\tquery\ttest10.2\t900\t1000\t101\t+\t.\tID=query_2;Name=test10.2\n") + 
f.write("chr1\tquery\ttest10.3\t1100\t1300\t201\t+\t.\tID=query_3;Name=test10.3\n") + f.close() + + def _writeQueryGff3File_case11(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\ttest11.1\t420\t480\t61\t+\t.\tID=query_1;Name=test11.1\n") + f.write("chr1\tquery\ttest11.2\t450\t715\t266\t+\t.\tID=query_2;Name=test11.2\n") + f.close() + + def _writeGFF3File(self, fileName): + f = open(fileName, "w") + f.write("chr1\ttest\ttest2.1\t9\t1000\t1001\t+\t.\tID=test2.1;Name=test2.1\n") + f.write("chr1\ttest\ttest2.2\t50\t350\t301\t+\t.\tID=test2.2;Name=test2.2\n") + f.write("chr1\ttest\ttest2.3\t100\t600\t501\t+\t.\tID=test2.3;Name=test2.3\n") + f.write("chr1\ttest\ttest2.4\t200\t450\t251\t+\t.\tID=test2.4;Name=test2.4\n") + f.write("chr1\ttest\ttest2.5\t700\t950\t251\t+\t.\tID=test2.5;Name=test2.5\n") + f.write("chr1\ttest\ttest2.6\t800\t900\t101\t+\t.\tID=test2.6;Name=test2.6\n") + f.write("chr1\ttest\ttest2.7\t1200\t1300\t101\t+\t.\tID=test2.7;Name=test2.7\n") + f.close() + +if __name__ == "__main__": + unittest.main() diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/test/Test_F_FindOverlaps_randomExample.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/ncList/test/Test_F_FindOverlaps_randomExample.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,48 @@ +import unittest +import os +import time +from commons.core.utils.FileUtils import FileUtils +from SMART.Java.Python.ncList.test.MockFindOverlaps_randomExample import MockFindOverlaps_randomExample_NonOrder +from SMART.Java.Python.ncList.FindOverlaps_naif import FindOverlaps_naif +from SMART.Java.Python.FindOverlapsOptim import FindOverlapsOptim + +class Test_F_FindOverlaps_randomExample(unittest.TestCase): + + def setUp(self): + self._output_optim = 'output_optim.gff3' + + def test_FindOverlaps_NonOrder(self): + inputRefGff3FileName = 'refMOverlaps.gff3' + inputQueryGff3FileName = 'queryMOverlaps.gff3' + outputDataName = 'timeResult.dat' + 
fTime = open(outputDataName, 'w') + fTime.write('NbRef\tNbQuery\tNbOverlap\ttime\n') + numberOfRefReads = 10 + chromSize = 100000 + numberOfQReads = 10 + print 'ref size = %d, query size = %d' %(numberOfRefReads, numberOfQReads) + iMFOR_ref = MockFindOverlaps_randomExample_NonOrder(inputRefGff3FileName, 'ref', numberOfRefReads, chromSize) + iMFOR_ref.write() + iMFOR_query = MockFindOverlaps_randomExample_NonOrder(inputQueryGff3FileName,'q', numberOfQReads, chromSize) + iMFOR_query.write() + iFOO = FindOverlapsOptim(0) + iFOO.setRefFileName(inputRefGff3FileName, "gff3") + iFOO.setQueryFileName(inputQueryGff3FileName, "gff3") + iFOO.setOutputFileName(self._output_optim) + startTime_optim = time.time() + iFOO.run() + iFOO.close() + nbOverlap = iFOO._nbOverlaps + endTime_optim = time.time() + totalTime_optim = endTime_optim - startTime_optim + print 'we take %s second.' % (totalTime_optim) + fTime.write('%d\t%d\t%d\t%.2f\n'%(numberOfRefReads, numberOfQReads, nbOverlap, totalTime_optim)) + fTime.close() + os.remove(inputQueryGff3FileName) + os.remove(inputRefGff3FileName) + os.remove(self._output_optim) + os.remove(outputDataName) + + +if __name__ == "__main__": + unittest.main() diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/test/Test_F_NCList.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/ncList/test/Test_F_NCList.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,302 @@ +import os +import unittest +import struct +from SMART.Java.Python.ncList.NCList import NCList +from SMART.Java.Python.misc import Utils +from commons.core.utils.FileUtils import FileUtils +from SMART.Java.Python.ncList.test.MockFindOverlapsWithSeveralIntervals import * +from commons.core.parsing.GffParser import GffParser +from SMART.Java.Python.ncList.FileSorter import FileSorter + +class Test_F_NCList(unittest.TestCase): + + def setUp(self): + self._inputGff3FileName = 'sortedFile.gff3' + self._sortedFileName = 'sortedFile.pkl' + 
self._expHFileName = 'expH.bin' + self._expLFileName = 'expL.bin' + self._obsHFileName = 'H.bin' + self._obsLFileName = 'L.bin' + self._addressFileName = 'address.txt' + self._writeGFF3File(self._inputGff3FileName) + self._ncList = NCList(0) + self._ncList.setChromosome("chr1") + + def tearDown(self): + return + for fileName in (self._inputGff3FileName, self._sortedFileName, self._expHFileName, self._expLFileName, self._obsHFileName, self._obsLFileName, self._addressFileName): + if os.path.exists(fileName): + os.remove(fileName) + + def _sortAndBuild(self): + parser = GffParser(self._inputGff3FileName) + fs = FileSorter(parser, 0) + fs.setOutputFileName(self._sortedFileName) + fs.sort() + self._ncList.setFileName(self._sortedFileName) + self._ncList.setNbElements(parser.getNbTranscripts()) + self._ncList.buildLists() + + def test_run_with_one_elementSubList(self): + iMock = MockFindOverlapsWithOneInterval() + iMock.write(self._inputGff3FileName) + self._sortAndBuild() + self._writeExpHFile_one_elementSubList() + self._writeExpLFile_one_elementSubList() + self.assertTrue(FileUtils.are2FilesIdentical(self._expHFileName, self._ncList._hFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expLFileName, self._ncList._lFileName)) + + def test_case1(self): + iMock = MockFindOverlapsWithServeralIntervals_case1() + iMock.write(self._inputGff3FileName) + self._sortAndBuild() + self._writeExpHFileCase1() + self._writeExpLFileCase1() + self.assertTrue(FileUtils.are2FilesIdentical(self._expHFileName, self._ncList._hFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expLFileName, self._ncList._lFileName)) + + def test_case2(self): + iMock = MockFindOverlapsWithServeralIntervals_case2() + iMock.write(self._inputGff3FileName) + self._sortAndBuild() + self._writeExpHFileCase2() + self._writeExpLFileCase2() + self.assertTrue(FileUtils.are2FilesIdentical(self._expHFileName, self._ncList._hFileName)) + 
self.assertTrue(FileUtils.are2FilesIdentical(self._expLFileName, self._ncList._lFileName)) + + def test_case3(self): + iMock = MockFindOverlapsWithServeralIntervals_case3() + iMock.write(self._inputGff3FileName) + self._sortAndBuild() + self._writeExpHFileCase3() + self._writeExpLFileCase3() + self.assertTrue(FileUtils.are2FilesIdentical(self._expHFileName, self._ncList._hFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expLFileName, self._ncList._lFileName)) + + def test_case4_5(self): + iMock = MockFindOverlapsWithServeralIntervals_case4_5() + iMock.write(self._inputGff3FileName) + self._sortAndBuild() + self._writeExpHFileCase4_5() + self._writeExpLFileCase4_5() + self.assertTrue(FileUtils.are2FilesIdentical(self._expHFileName, self._ncList._hFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expLFileName, self._ncList._lFileName)) + + def test_case6_7(self): + iMock = MockFindOverlapsWithServeralIntervals_case6_7() + iMock.write(self._inputGff3FileName) + self._sortAndBuild() + self._writeExpHFileCase6_7() + self._writeExpLFileCase6_7() + self.assertTrue(FileUtils.are2FilesIdentical(self._expHFileName, self._ncList._hFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expLFileName, self._ncList._lFileName)) + + def test_case8(self): + iMock = MockFindOverlapsWithServeralIntervals_case8() + iMock.write(self._inputGff3FileName) + self._sortAndBuild() + self._writeExpHFileCase8() + self._writeExpLFileCase8() + self.assertTrue(FileUtils.are2FilesIdentical(self._expHFileName, self._ncList._hFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expLFileName, self._ncList._lFileName)) + + def test_case9(self): + iMock = MockFindOverlapsWithServeralIntervals_case9() + iMock.write(self._inputGff3FileName) + self._sortAndBuild() + self._writeExpHFileCase9() + self._writeExpLFileCase9() + self.assertTrue(FileUtils.are2FilesIdentical(self._expHFileName, self._ncList._hFileName)) + 
self.assertTrue(FileUtils.are2FilesIdentical(self._expLFileName, self._ncList._lFileName)) + + def test_case10(self): + iMock = MockFindOverlapsWithServeralIntervals_case10() + iMock.write(self._inputGff3FileName) + self._sortAndBuild() + self._writeExpHFileCase10() + self._writeExpLFileCase10() + self.assertTrue(FileUtils.are2FilesIdentical(self._expHFileName, self._ncList._hFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expLFileName, self._ncList._lFileName)) + + def test_case11(self): + iMock = MockFindOverlapsWithServeralIntervals_case11() + iMock.write(self._inputGff3FileName) + self._sortAndBuild() + self._writeExpHFileCase11() + self._writeExpLFileCase11() + self.assertTrue(FileUtils.are2FilesIdentical(self._expHFileName, self._ncList._hFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expLFileName, self._ncList._lFileName)) + + def test_case12(self): + iMock = MockFindOverlapsWithServeralIntervals_case12() + iMock.write(self._inputGff3FileName) + self._sortAndBuild() + self._writeExpHFileCase12() + self._writeExpLFileCase12() + self.assertTrue(FileUtils.are2FilesIdentical(self._expHFileName, self._ncList._hFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expLFileName, self._ncList._lFileName)) + + def _writeGFF3File(self, fileName): + f = open(fileName, "w") + f.write("chr1\ttest\ttest2.1\t9\t1000\t1001\t+\t.\tID=test2.1;Name=test2.1\n") + f.write("chr1\ttest\ttest2.2\t50\t350\t301\t+\t.\tID=test2.2;Name=test2.2\n") + f.write("chr1\ttest\ttest2.3\t100\t600\t501\t+\t.\tID=test2.3;Name=test2.3\n") + f.write("chr1\ttest\ttest2.4\t200\t450\t251\t+\t.\tID=test2.4;Name=test2.4\n") + f.write("chr1\ttest\ttest2.5\t700\t950\t251\t+\t.\tID=test2.5;Name=test2.5\n") + f.write("chr1\ttest\ttest2.6\t800\t900\t101\t+\t.\tID=test2.6;Name=test2.6\n") + f.write("chr1\ttest\ttest2.7\t1200\t1300\t101\t+\t.\tID=test2.7;Name=test2.7\n") + f.close() + + def _writeBinFile(self, fileName, elements): + handle = open(fileName, "wb") + for 
element in elements: + handle.write(struct.pack('l', element)) + handle.close() + + def _writeExpHFile(self, HFileName): + list = [0, 2, 48, 3, -1, -1, -1, -1, 120, 1, 144, 1, -1, -1, -1, -1] + H = open(HFileName, 'wb') + + for item in list: + item = struct.pack('l', item) + H.write(item) + H.close() + + def _writeExpHFile_with_empty_SubParentDict(self, HFileName): + list = [0, 1, -1, -1] + H = open(HFileName, 'wb') + + for item in list: + item = struct.pack('l', item) + H.write(item) + H.close() + + def _writeExpHFile_one_elementSubList(self): + elements = [0, 1] + self._writeBinFile(self._expHFileName, elements) + + def _writeExpHFileCase1(self): + elements = [0, 2, 2, 3, 5, 1, 6, 1] + self._writeBinFile(self._expHFileName, elements) + + def _writeExpHFileCase2(self): + elements = [0, 2, 2, 1, 3, 1, 4, 1] + self._writeBinFile(self._expHFileName, elements) + + def _writeExpHFileCase3(self): + elements = [0, 2, 2, 1, 3, 1, 4, 2] + self._writeBinFile(self._expHFileName, elements) + + def _writeExpHFileCase4_5(self): + elements = [0, 1, 1, 1, 2, 1] + self._writeBinFile(self._expHFileName, elements) + + def _writeExpHFileCase6_7(self): + elements = [0, 1, 1, 4] + self._writeBinFile(self._expHFileName, elements) + + def _writeExpHFileCase8(self): + elements = [0, 1, 1, 2] + self._writeBinFile(self._expHFileName, elements) + + def _writeExpHFileCase9(self): + elements = [0, 2, 2, 1] + self._writeBinFile(self._expHFileName, elements) + + def _writeExpHFileCase10(self): + elements = [0, 3, 3, 3] + self._writeBinFile(self._expHFileName, elements) + + def _writeExpHFileCase11(self): + elements = [0, 2, 2, 2, 4, 2] + self._writeBinFile(self._expHFileName, elements) + + def _writeExpHFileCase12(self): + elements = [0, 1, 1, 3, 4, 1] + self._writeBinFile(self._expHFileName, elements) + + def _writeExpLFile_one_elementSubList(self): + elements = [0, 1000, 0, -1, -1] + self._writeBinFile(self._expLFileName, elements) + + def _writeExpLFileCase1(self): + elements = [ 0, 1000, 0, 
1, -1, \ + 1200, 1300, 2345, -1, -1, \ + 50, 350, 391, -1, 0, \ + 100, 600, 781, 2, 0, \ + 700, 950, 1563, 3, 0, \ + 200, 450, 1172, -1, 3, \ + 800, 900, 1954, -1, 4] + self._writeBinFile(self._expLFileName, elements) + + def _writeExpLFileCase2(self): + elements = [ 0, 500, 0, 1, -1, \ + 900, 1200, 1561, -1, -1, \ + 50, 450, 389, 2, 0, \ + 100, 400, 779, 3, 2, \ + 100, 200, 1170, -1, 3] + self._writeBinFile(self._expLFileName, elements) + + def _writeExpLFileCase3(self): + elements = [ 0, 500, 0, 1, -1, \ + 800, 1000, 1952, -1, -1, \ + 50, 450, 389, 2, 0, \ + 100, 400, 779, 3, 2, \ + 100, 200, 1170, -1, 3, \ + 300, 400, 1561, -1, 3] + self._writeBinFile(self._expLFileName, elements) + + def _writeExpLFileCase4_5(self): + elements = [ 0, 1000, 0, 1, -1, \ + 200, 800, 391, 2, 0, \ + 400, 600, 782, -1, 1] + self._writeBinFile(self._expLFileName, elements) + + def _writeExpLFileCase6_7(self): + elements = [ 0, 1000, 0, 1, -1, \ + 100, 300, 391, -1, 0, \ + 400, 500, 782, -1, 0, \ + 510, 520, 1173, -1, 0, \ + 850, 950, 1563, -1, 0] + self._writeBinFile(self._expLFileName, elements) + + def _writeExpLFileCase8(self): + elements = [ 0, 1000, 0, 1, -1, \ + 100, 200, 391, -1, 0, \ + 300, 400, 782, -1, 0] + self._writeBinFile(self._expLFileName, elements) + + def _writeExpLFileCase9(self): + elements = [ 0, 1000, 0, 1, -1, \ + 800, 1200, 782, -1, -1, \ + 600, 700, 391, -1, 0] + self._writeBinFile(self._expLFileName, elements) + + def _writeExpLFileCase10(self): + elements = [ 0, 1000, 0, 1, -1, \ + 1200, 1300, 1576, -1, -1, \ + 1400, 1500, 1972, -1, -1, \ + 100, 200, 394, -1, 0, \ + 300, 400, 788, -1, 0, \ + 500, 600, 1182, -1, 0] + self._writeBinFile(self._expLFileName, elements) + + def _writeExpLFileCase11(self): + elements = [ 0, 500, 0, 1, -1, \ + 700, 900, 1180, 2, -1, \ + 100, 200, 392, -1, 0, \ + 300, 400, 786, -1, 0, \ + 710, 720, 1574, -1, 1, \ + 740, 750, 1967, -1, 1] + self._writeBinFile(self._expLFileName, elements) + + def _writeExpLFileCase12(self): + elements 
= [ 0, 1400, 0, 1, -1, \ + 300, 500, 368, 2, 0, \ + 800, 1100, 1106, -1, 0, \ + 1200, 1300, 1476, -1, 0, \ + 300, 500, 737, -1, 1] + self._writeBinFile(self._expLFileName, elements) + +if __name__ == "__main__": + unittest.main() diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/test/Test_FindOverlapsWithOneInterval.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/ncList/test/Test_FindOverlapsWithOneInterval.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,81 @@ +import unittest +import struct +import os +from SMART.Java.Python.structure.Interval import Interval +from SMART.Java.Python.ncList.FindOverlapsWithOneInterval import FindOverlapsWithOneInterval +from SMART.Java.Python.ncList.NCListCursor import NCListCursor + +class Test_FindOverlapsWithOneInterval(unittest.TestCase): + + def setUp(self): + self._inputGff3FileName = 'sortedFile.gff3' + self._writeGFF3File(self._inputGff3FileName) + self._obsFileName = "overlap.gff3" + self._iFOWOI = FindOverlapsWithOneInterval(0) + self._iFOWOI.setFileName(self._inputGff3FileName, "gff3") + self._iFOWOI._chromosome = "chr1" + self._iFOWOI.prepareIntermediateFiles() + self._iFOWOI.createNCList() + self._ncList = self._iFOWOI._ncList + self._iFOWOI.setOutputFileName(self._obsFileName) + + def tearDown(self): + return + self._iFOWOI.close() + for file in (self._inputGff3FileName, self._obsFileName): + if os.path.exists(file): + os.remove(file) + + def test_binarySearch_first_element_overlap(self): + self._iFOWOI.setInterval("chr1", 500, 850) + obsReadPosition = self._iFOWOI.binarySearch(NCListCursor(None, self._ncList, 0, 0), 0, 6) + expReadPosition = 0 + self._iFOWOI.dumpWriter() + self._iFOWOI.close() + self.assertEquals(expReadPosition, obsReadPosition._lIndex) + + def test_binarySearch_second_element_overlap(self): + self._iFOWOI.setInterval("chr1", 500, 850) + obsReadPosition = self._iFOWOI.binarySearch(NCListCursor(None, self._ncList, 2, 0), 2, 6) + 
expReadPosition = 3 + self._iFOWOI.dumpWriter() + self._iFOWOI.close() + self.assertEquals(expReadPosition, obsReadPosition._lIndex) + + def test_binarySearch_empty_subList(self): + self._iFOWOI.setInterval("chr1", 500, 850) + obsReadPosition = self._iFOWOI.binarySearch(NCListCursor(None, self._ncList, 5, 0), 5, 5) + expReadPosition = None + self._iFOWOI.dumpWriter() + self._iFOWOI.close() + self.assertEquals(expReadPosition, obsReadPosition) + + def test_binarySearch_no_overlap_right(self): + self._iFOWOI.setInterval("chr1", 1400, 1500) + obsReadPosition = self._iFOWOI.binarySearch(NCListCursor(None, self._ncList, 0, 0), 0, 6) + expReadPosition = None + self._iFOWOI.dumpWriter() + self._iFOWOI.close() + self.assertEquals(expReadPosition, obsReadPosition) + + def test_binarySearch_no_overlap_left(self): + self._iFOWOI.setInterval("chr1", 0, 45) + obsReadPosition = self._iFOWOI.binarySearch(NCListCursor(None, self._ncList, 2, 0), 2, 6) + expReadPosition = None + self._iFOWOI.dumpWriter() + self._iFOWOI.close() + self.assertEquals(expReadPosition, obsReadPosition) + + def _writeGFF3File(self, fileName): + f = open(fileName, "w") + f.write("chr1\ttest\ttest2.1\t9\t1000\t1001\t+\t.\tID=test2.1;Name=test2.1\n") + f.write("chr1\ttest\ttest2.2\t50\t350\t301\t+\t.\tID=test2.2;Name=test2.2\n") + f.write("chr1\ttest\ttest2.3\t100\t600\t501\t+\t.\tID=test2.3;Name=test2.3\n") + f.write("chr1\ttest\ttest2.4\t200\t450\t251\t+\t.\tID=test2.4;Name=test2.4\n") + f.write("chr1\ttest\ttest2.5\t700\t950\t251\t+\t.\tID=test2.5;Name=test2.5\n") + f.write("chr1\ttest\ttest2.6\t800\t900\t101\t+\t.\tID=test2.6;Name=test2.6\n") + f.write("chr1\ttest\ttest2.7\t1200\t1300\t101\t+\t.\tID=test2.7;Name=test2.7\n") + f.close() + +if __name__ == "__main__": + unittest.main() diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/test/Test_FindOverlapsWithSeveralIntervals.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/smart_toolShed/SMART/Java/Python/ncList/test/Test_FindOverlapsWithSeveralIntervals.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,160 @@ +import unittest +import os +from SMART.Java.Python.ncList.FindOverlapsWithSeveralIntervals import FindOverlapsWithSeveralIntervals + +class Test_FindOverlapsWithSeveralIntervals(unittest.TestCase): + + def setUp(self): + self._inputRefGff3FileName = 'sortedFile.gff3' + self._writeGFF3File(self._inputRefGff3FileName) + self._inputQueryGff3FileName = 'sorted_Query.gff3' + self._writeQueryGff3File(self._inputQueryGff3FileName) + self._outputGff3FileName = 'overlaps.gff3' + self._iFOWSI = FindOverlapsWithSeveralIntervals(self._inputRefGff3FileName, self._inputQueryGff3FileName) + self._iFOWSI.setOutputGff3FileName(self._outputGff3FileName) + + def tearDown(self): + os.remove(self._inputRefGff3FileName) + os.remove(self._inputQueryGff3FileName) + os.remove(self._outputGff3FileName) + self._iFOWSI.deletIntermediateFiles() + + def test_isOverlapping_true(self): + queryGff3Addr = 116 + RefGff3Addr = 231 + obs = self._iFOWSI.isOverlapping(queryGff3Addr, RefGff3Addr) + exp = 0 + self.assertEquals(exp, obs) + + def test_isOverlapping_false_left(self): + queryGff3Addr = 116 + RefGff3Addr = 58 + obs = self._iFOWSI.isOverlapping(queryGff3Addr, RefGff3Addr) + exp = -1 + self.assertEquals(exp, obs) + + def test_isOverlapping_false_right(self): + queryGff3Addr = 116 + RefGff3Addr = 347 + obs = self._iFOWSI.isOverlapping(queryGff3Addr, RefGff3Addr) + exp = 1 + self.assertEquals(exp, obs) + + def test_getHisFirstChild(self): + firstRefLAddr = 0 + obsFirstChildLAddr = self._iFOWSI.getHisFirstChild(firstRefLAddr) + expFirstChildLAddr = 48 + self.assertEquals(expFirstChildLAddr, obsFirstChildLAddr) + + def test_isLastElement_true(self): + refLAddr = 96 + obsBool = self._iFOWSI.isLastElement(refLAddr) + expBool = True + self.assertEquals(expBool, obsBool) + + def test_isLastElement_false(self): + refLAddr = 72 + obsBool = 
self._iFOWSI.isLastElement(refLAddr) + expBool = False + self.assertEquals(expBool, obsBool) + + def test_isLastElement_highestLevel_true(self): + refLAddr = 24 + obsBool = self._iFOWSI.isLastElement(refLAddr) + expBool = True + self.assertEquals(expBool, obsBool) + + def test_isLastElement_highestLevel_false(self): + refLAddr = 0 + obsBool = self._iFOWSI.isLastElement(refLAddr) + expBool = False + self.assertEquals(expBool, obsBool) + + def test_findOverlapIter(self): + queryGff3Addr = 175 + firstRefLAddr = 0 + obsFirstOverlapLAddr = self._iFOWSI.findOverlapIter(queryGff3Addr, firstRefLAddr) + expFirstOverlapLAddr = 0 + self.assertEquals(expFirstOverlapLAddr, obsFirstOverlapLAddr) + + def test_not_findOverlapIter(self): + queryGff3Addr = 295 + firstRefLAddr = 24 + obsFirstOverlapLAddr = self._iFOWSI.findOverlapIter(queryGff3Addr, firstRefLAddr) + expFirstOverlapLAddr = None + self.assertEquals(expFirstOverlapLAddr, obsFirstOverlapLAddr) + + def test_findOverlapIter_not_the_first_RefOverlap(self): + queryGff3Addr = 235 + firstRefLAddr = 0 + obsFirstOverlapLAddr = self._iFOWSI.findOverlapIter(queryGff3Addr, firstRefLAddr) + expFirstOverlapLAddr = 24 + self.assertEquals(expFirstOverlapLAddr, obsFirstOverlapLAddr) + + def test_changeToNewSubEndLAddr(self): + firstChildLAddr = 48 + subEndLAddr = 48 + expSubEndLAddr = 120 + obsSubEndLAddr = self._iFOWSI.changeToNewSubEndLAddr(firstChildLAddr, subEndLAddr) + self.assertEquals(expSubEndLAddr, obsSubEndLAddr) + + def test_defineSubEndLaddr(self): + parentLAddr = -1 + expSubEndLAddr = 48 + obsSubEndLAddr = self._iFOWSI.defineSubEndLaddr(parentLAddr) + self.assertEquals(expSubEndLAddr, obsSubEndLAddr) + + def test_getNextRefIntervalInCaseNotOverLap(self): + firstRefLAddr = 96 + expRefLAddr = 24 + obsRefLAddr = self._iFOWSI.getNextRefIntervalInCaseNotOverLap(firstRefLAddr) + self.assertEquals(expRefLAddr, obsRefLAddr) + + def test_getNextRefIntervalInCaseOverLap(self): + firstChildLAddr = -1 + firstRefLAddr = 120 + 
subEndLAddr = 144 + expRefLAddr, expSubEndLAddr = (96, 144) + obsRefLAddr, obsSubEndLAddr = self._iFOWSI.getNextRefIntervalInCaseOverLap(firstChildLAddr, firstRefLAddr, subEndLAddr) + self.assertEquals((expRefLAddr, expSubEndLAddr), (obsRefLAddr, obsSubEndLAddr)) + + def test_not_findOverlapIter_between2RefIntervals(self): + inputQueryGff3FileName = 'query2.gff3' + self._writeQueryGff3File2(inputQueryGff3FileName) + self._iFOWSI.setQueryGff3FileName(inputQueryGff3FileName) + queryGff3Addr = 0 + firstRefLAddr = 0 + obsFirstOverlapLAddr = self._iFOWSI.findOverlapIter(queryGff3Addr, firstRefLAddr) + expFirstOverlapLAddr = 24 + self.assertEquals(expFirstOverlapLAddr, obsFirstOverlapLAddr) + os.remove(inputQueryGff3FileName) + + def _writeQueryGff3File2(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\ttest1\t1100\t1150\t126\t+\t.\tID=test1.1;Name=test1.1\n") + f.write("chr1\tquery\ttest2\t1250\t1300\t781\t+\t.\tID=test1.2;Name=test1.2\n") + f.close() + + def _writeQueryGff3File(self, fileName): + f = open(fileName, 'w') + f.write("chr1\tquery\ttest1.1\t25\t150\t126\t+\t.\tID=test1.1;Name=test1.1\n") + f.write("chr1\tquery\ttest1.2\t70\t850\t781\t+\t.\tID=test1.2;Name=test1.2\n") + f.write("chr1\tquery\ttest1.3\t550\t850\t201\t+\t.\tID=test1.3;Name=test1.3\n") + f.write("chr1\tquery\ttest1.4\t925\t1025\t101\t+\t.\tID=test1.4;Name=test1.4\n") + f.write("chr1\tquery\ttest1.5\t1201\t1210\t10\t+\t.\tID=test1.5;Name=test1.5\n") + f.write("chr1\tquery\ttest1.6\t1500\t1600\t101\t+\t.\tID=test1.6;Name=test1.6\n") + f.close() + + def _writeGFF3File(self, fileName): + f = open(fileName, "w") + f.write("chr1\ttest\ttest2.1\t9\t1000\t1001\t+\t.\tID=test2.1;Name=test2.1\n") + f.write("chr1\ttest\ttest2.2\t50\t350\t301\t+\t.\tID=test2.2;Name=test2.2\n") + f.write("chr1\ttest\ttest2.3\t100\t600\t501\t+\t.\tID=test2.3;Name=test2.3\n") + f.write("chr1\ttest\ttest2.4\t200\t450\t251\t+\t.\tID=test2.4;Name=test2.4\n") + 
f.write("chr1\ttest\ttest2.5\t700\t950\t251\t+\t.\tID=test2.5;Name=test2.5\n") + f.write("chr1\ttest\ttest2.6\t800\t900\t101\t+\t.\tID=test2.6;Name=test2.6\n") + f.write("chr1\ttest\ttest2.7\t1200\t1300\t101\t+\t.\tID=test2.7;Name=test2.7\n") + f.close() + +if __name__ == "__main__": + unittest.main() diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/test/Test_FindOverlaps_randomExample.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/ncList/test/Test_FindOverlaps_randomExample.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,99 @@ +import unittest +import os +import time +from SMART.Java.Python.misc import Utils +from SMART.Java.Python.ncList.test.MockFindOverlaps_randomExample import * +from SMART.Java.Python.ncList.FindOverlaps_naif import FindOverlaps_naif +from SMART.Java.Python.FindOverlapsOptim import FindOverlapsOptim + +class Test_FindOverlaps_randomExample(unittest.TestCase): + + def setUp(self): + self._output_naif = 'output_naif.gff3' + self._outputOptim = 'outputOptim.gff3' + + + def tearDown(self): + return + os.remove(self._output_naif) + os.remove(self._outputOptim) + + def test_run_smallSize(self): + inputRefGff3FileName = 'ref_small.gff3' + numberOfReads = 10 + chromSize = 1000 + iMFO_rand = MockFindOverlaps_randomExample(inputRefGff3FileName, 'reference', numberOfReads, chromSize) + iMFO_rand.write() + + inputQueryGff3FileName = 'query_small.gff3' + iMFO_rand = MockFindOverlaps_randomExample(inputQueryGff3FileName,'query', 10, 1000) + iMFO_rand.write() + + iFON = FindOverlaps_naif(inputRefGff3FileName, inputQueryGff3FileName) + iFON.setOutputGff3FileName(self._output_naif) + iFOO = FindOverlapsOptim(0) + iFOO.setRefFileName(inputRefGff3FileName, "gff3") + iFOO.setQueryFileName(inputQueryGff3FileName, "gff3") + iFOO.setOutputFileName(self._outputOptim) + iFOO.prepareIntermediateFiles() + iFOO.createNCLists() + + startTime_naif = time.time() + iFON.run() + iFON.close() + endTime_naif = 
time.time() + totalTime_naif = endTime_naif - startTime_naif + print 'for naive algo, we take %e second' % (totalTime_naif) + + startTimeOptim = time.time() + iFOO.compare() + endTimeOptim = time.time() + totalTimeOptim = endTimeOptim - startTimeOptim + print 'for optim algo, we take %e second' % (totalTimeOptim) + iFOO.close() + + self.assertTrue(Utils.diff(self._output_naif, self._outputOptim)) + + os.remove(inputRefGff3FileName) + os.remove(inputQueryGff3FileName) + + + def test_creatRandomExampleWithMOverlaps_smallSize(self): + inputRefGff3FileName = 'refMOverlaps_small.gff3' + inputQueryGff3FileName = 'queryMOverlaps_small.gff3' + numberOfReads = 10 + chromSize = 1000 + iRMSS = MockFindOverlaps_randomExample_MOverlaps(inputRefGff3FileName, inputQueryGff3FileName, 7, numberOfReads, chromSize) + iRMSS.createRandomExample() + + + iFON = FindOverlaps_naif(inputRefGff3FileName, inputQueryGff3FileName) + iFON.setOutputGff3FileName(self._output_naif) + iFOO = FindOverlapsOptim(0) + iFOO.setRefFileName(inputRefGff3FileName, "gff3") + iFOO.setQueryFileName(inputQueryGff3FileName, "gff3") + iFOO.setOutputFileName(self._outputOptim) + iFOO.prepareIntermediateFiles() + iFOO.createNCLists() + + startTime_naif = time.time() + iFON.run() + endTime_naif = time.time() + totalTime_naif = endTime_naif - startTime_naif + print 'for naive algo, we take %e second' % (totalTime_naif) + iFON.close() + + startTimeOptim = time.time() + iFOO.compare() + endTimeOptim = time.time() + totalTimeOptim = endTimeOptim - startTimeOptim + print 'for optim algo, we take %e second' % (totalTimeOptim) + iFOO.close() + + self.assertTrue(Utils.diff(self._output_naif, self._outputOptim)) + + os.remove(inputRefGff3FileName) + os.remove(inputQueryGff3FileName) + +if __name__ == "__main__": + unittest.main() diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/test/Test_randExample.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/smart_toolShed/SMART/Java/Python/ncList/test/Test_randExample.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,51 @@ +import unittest +import time +from SMART.Java.Python.ncList.test.MockFindOverlaps_randomExample import * +from SMART.Java.Python.FindOverlapsOptim import FindOverlapsOptim + +class Test_F_FindOverlaps_randomExample(unittest.TestCase): + + def setUp(self): + self._output_optim = 'output_optim.gff3' + + def test_creatRandomExampleWithMOverlaps(self): + inputRefGff3FileName = 'refMOverlaps.gff3' + inputQueryGff3FileName = 'queryMOverlaps.gff3' + outputDataName = 'timeResult.dat' + fTime = open(outputDataName, 'w') + fTime.write('NbRef\tNbQuery\tNbOverlap\ttime\n') + numberOfRefReads = 1000 + chromSize = 100000 + while numberOfRefReads <= 1000: + numberOfQReads = 1000 + while numberOfQReads <= 1000: + print 'ref size = %d, query size = %d' %(numberOfRefReads, numberOfQReads) + iMFOR_ref = MockFindOverlaps_randomExample(inputRefGff3FileName, 'ref', numberOfRefReads, chromSize) + iMFOR_ref.write() + iMFOR_query = MockFindOverlaps_randomExample(inputQueryGff3FileName,'q', numberOfQReads, chromSize) + iMFOR_query.write() + iFOO = FindOverlapsOptim(0) + iFOO.setRefFileName(inputRefGff3FileName, "gff3") + iFOO.setQueryFileName(inputQueryGff3FileName, "gff3") + iFOO.setOutputFileName(self._output_optim) + iFOO.prepareIntermediateFiles() + iFOO.createNCLists() + + startTime_optim = time.time() + iFOO.compare() + endTime_optim = time.time() + totalTime_optim = endTime_optim - startTime_optim + print 'we took %s second.' 
% (totalTime_optim) + nbOverlap = iFOO._nbOverlaps + iFOO.close() + fTime.write('%d\t%d\t%d\t%.2f\n' % (numberOfRefReads, numberOfQReads, nbOverlap, totalTime_optim)) + numberOfQReads *= 10 + numberOfRefReads *= 10 + fTime.close() + os.remove(inputQueryGff3FileName) + os.remove(inputRefGff3FileName) + os.remove(self._output_optim) + + +if __name__ == "__main__": + unittest.main() diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/ncList/test/__init__.py diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/plot.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/plot.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,223 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. 
Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# + +""" +Plot the data from the data files +""" + +import os, re, math +from optparse import OptionParser +from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer +from SMART.Java.Python.misc.RPlotter import RPlotter +from SMART.Java.Python.misc.Progress import Progress +from commons.core.utils.FileUtils import FileUtils + +class Plot(object): + + def __init__(self, verbosity): + self.verbosity = verbosity + self.keep = False + + def keepTmpFiles(self, boolean): + self.keep = boolean + + def setShape(self, shape): + self.shape = shape + + def setInputFileName(self, fileName, format): + self.parser = TranscriptContainer(fileName, format, self.verbosity) + + def setXData(self, tag, default): + self.x = tag + self.xDefault = default + + def setYData(self, tag, default): + self.y = tag + self.yDefault = default + + def setZData(self, tag, default): + self.z = tag + self.zDefault = default + + def setNbBars(self, nbBars): + self.nbBars = nbBars + + def setOutputFileName(self, fileName): + self.outputFileName = fileName + + def setRegression(self, regression): + self.regression = regression + + def setLog(self, log): + self.log = log + + def createPlotter(self): + self.plotter = RPlotter(self.outputFileName, self.verbosity, self.keep) + if self.shape == "barplot": + self.plotter.setBarplot(True) + elif self.shape == "line": + pass + elif self.shape == "points": + self.plotter.setPoints(True) + elif self.shape == "heatPoints": + self.plotter.setHeatPoints(True) + else: + raise Exception("Do not understand shape '%s'\n" % 
(self.shape)) + + self.plotter.setLog(self.log) + self.plotter.setRegression(self.regression) + + def getValues(self, transcript): + x = transcript.getTagValue(self.x) + y = None + z = None + if self.y != None: + y = transcript.getTagValue(self.y) + if self.z != None: + z = transcript.getTagValue(self.z) + if x == None: + if self.xDefault != None: + x = self.xDefault + else: + raise Exception("Error! Transcript %s do not have the x-tag %s\n" % (transcript, self.x)) + if self.y != None: + if y == None: + if self.yDefault != None: + y = self.yDefault + else: + raise Exception("Error! Transcript %s do not have the y-tag %s\n" % (transcript, self.y)) + if self.z != None: + if z == None: + if self.zDefault != None: + z = self.zDefault + else: + raise Exception("Error! Transcript %s do not have the z-tag %s\n" % (transcript, self.z)) + x = float(x) + if self.y != None: + y = float(y) + if self.z != None: + z = float(z) + return (x, y, z) + + def correctPointsToBarplot(self, line): + minValue = int(math.floor(min(line.keys()))) + maxValue = int(math.ceil(max(line.keys()))) + step = (maxValue - minValue) / self.nbBars + values = dict([i * step + minValue, 0] for i in range(0, self.nbBars)) + top = (self.nbBars - 1) * step + minValue + for key, value in line.iteritems(): + newKey = min(top, int(math.floor((key - minValue) / float(maxValue - minValue) * self.nbBars)) * step + minValue) + values[newKey] += value + return values + + def parseFile(self): + line = {} + heatLine = {} + + cpt = 1 + for transcript in self.parser.getIterator(): + x, y, z = self.getValues(transcript) + name = transcript.name + if name == "unnamed transcript": + name = "transcript %d" % (cpt) + cpt += 1 + if self.shape in ("points", "heatPoints"): + line[name] = (x, y) + if self.shape == "heatPoints": + heatLine[name] = z + if self.shape == "line": + line[x] = y + if self.shape == "barplot": + line[x] = line.get(x, 0) + 1 + if self.shape == "barplot": + line = self.correctPointsToBarplot(line) + 
self.plotter.setXLabel(self.x) + if self.y != None: + self.plotter.setYLabel(self.y) + else: + self.plotter.setYLabel("Count") + self.plotter.addLine(line) + if self.shape == "heatPoints": + self.plotter.addHeatLine(heatLine) + self.plotter.plot() + + def close(self): + if self.regression: + print self.plotter.getCorrelationData() + if self.shape == "points": + rho = self.plotter.getSpearmanRho() + if rho == None: + print "Cannot compute Spearman rho." + else: + print "Spearman rho: %f" % (rho) + + def run(self): + self.createPlotter() + self.parseFile() + self.close() + + +if __name__ == "__main__": + + # parse command line + description = "Plot v1.0.2: Plot some information from a list of transcripts. [Category: Visualization]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input [compulsory] [format: transcript file format]") + parser.add_option("-x", "--x", dest="x", action="store", type="string", help="tag for the x value [format: string]") + parser.add_option("-y", "--y", dest="y", action="store", type="string", help="tag for the y value [format: string]") + parser.add_option("-z", "--z", dest="z", action="store", default=None, type="string", help="tag for the z value [format: string]") + parser.add_option("-X", "--xDefault", dest="xDefault", action="store", default=None, type="float", help="value for x when tag is not present [format: float]") + parser.add_option("-Y", "--yDefault", dest="yDefault", action="store", default=None, type="float", help="value for y when tag is not present [format: float]") + parser.add_option("-Z", "--zDefault", dest="zDefault", action="store", default=None, type="float", help="value for z when tag is not present [format: float]") + 
parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file names [format: output file in PNG format]") + parser.add_option("-s", "--shape", dest="shape", action="store", default="barplot", type="string", help="shape of the plot [format: choice (barplot, line, points, heatPoints)]") + parser.add_option("-n", "--nbBars", dest="nbBars", action="store", default=2, type="int", help="number of bars in barplot [format: int]") + parser.add_option("-k", "--keep", dest="keep", action="store_true", default=False, help="keep temporary files [format: bool]") + parser.add_option("-r", "--regression", dest="regression", action="store_true", default=False, help="plot regression line (in 'points' format) [format: bool]") + parser.add_option("-l", "--log", dest="log", action="store", default="y", type="string", help="use log on x- or y-axis (write 'x', 'y' or 'xy') [format: string]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + plot = Plot(options.verbosity) + plot.setInputFileName(options.inputFileName, options.format) + plot.setOutputFileName(options.outputFileName) + plot.setXData(options.x, options.xDefault) + plot.setYData(options.y, options.yDefault) + plot.setZData(options.z, options.zDefault) + plot.setShape(options.shape) + plot.setNbBars(options.nbBars) + plot.setRegression(options.regression) + plot.setLog(options.log) + plot.keepTmpFiles(options.keep) + plot.run() + diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/plotCoverage.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/plotCoverage.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,473 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. 
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# +import os, subprocess, glob, random +from optparse import OptionParser +from SMART.Java.Python.structure.Interval import Interval +from SMART.Java.Python.structure.Transcript import Transcript +from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer +from SMART.Java.Python.misc.RPlotter import RPlotter +from SMART.Java.Python.misc.Progress import Progress +from commons.core.parsing.FastaParser import FastaParser + +strands = [-1, 1] +colors = {-1: "blue", 1: "red", 0: "black"} +colorLine = "black" + +def parseTargetField(field): + strand = "+" + splittedFieldSpace = field.split() + splittedFieldPlus = field.split("+", 4) + if len(splittedFieldSpace) == 3: + id, start, end = splittedFieldSpace + elif len(splittedFieldSpace) == 4: + id, start, end, strand = splittedFieldSpace + elif len(splittedFieldPlus) == 3: + id, start, end = splittedFieldPlus + elif len(splittedFieldPlus) == 4: + id, start, end, strand = splittedFieldPlus + else: + raise Exception("Cannot parse Target field '%s'." 
% (field)) + return (id, int(start), int(end), strand) + + +class SimpleTranscript(object): + def __init__(self, transcript1, transcript2, color = None): + self.start = max(0, transcript1.getStart() - transcript2.getStart()) + self.end = min(transcript2.getEnd() - transcript2.getStart(), transcript1.getEnd() - transcript2.getStart()) + self.strand = transcript1.getDirection() * transcript2.getDirection() + self.exons = [] + for exon in transcript1.getExons(): + if exon.getEnd() >= transcript2.getStart() and exon.getStart() <= transcript2.getEnd(): + start = max(0, exon.getStart() - transcript2.getStart()) + end = min(transcript2.getEnd() - transcript2.getStart(), exon.getEnd() - transcript2.getStart()) + self.addExon(start, end, self.strand, color) + + def addExon(self, start, end, strand, color): + exon = SimpleExon(start, end, strand, color) + self.exons.append(exon) + + def getRScript(self, yOffset, height): + rString = "" + previousEnd = None + for exon in sorted(self.exons, key=lambda exon: exon.start): + if previousEnd != None: + rString += "segments(%.1f, %.1f, %.1f, %.1f, col = \"%s\")\n" % (previousEnd, yOffset + height / 4.0, exon.start, yOffset + height / 4.0, colorLine) + rString += exon.getRScript(yOffset, height) + previousEnd = exon.end + return rString + + +class SimpleExon(object): + def __init__(self, start, end, strand, color = None): + self.start = start + self.end = end + self.strand = strand + self.color = color + + def getRScript(self, yOffset, height): + color = self.color if self.color != None else colors[self.strand] + return "rect(%.1f, %.1f, %.1f, %.1f, col=\"%s\", border = \"%s\")\n" % (self.start, yOffset, self.end, yOffset + height / 2.0, color, colorLine) + + +class Plotter(object): + + def __init__(self, seed, index, verbosity): + self.seed = seed + self.index = index + self.verbosity = verbosity + self.maxCoverage = 0 + self.maxOverlap = 0 + self.log = "" + self.merge = False + self.width = 1500 + self.heigth = 1000 + self.xLabel = 
"" + self.yLabel = "" + self.title = None + self.absPath = os.getcwd() + self.coverageDataFileName = "tmpFile_%d_%s.dat" % (seed, index) + self.coverageScript = "" + self.overlapScript = "" + self.outputFileName = None + + def setOutputFileName(self, fileName): + self.outputFileName = fileName + + def setTranscript(self, transcript): + self.transcript = transcript + self.name = transcript.getName() + self.size = transcript.getEnd() - transcript.getStart() + 1 + if self.title == None: + self.title = self.name + else: + self.title += " " + self.name + + def setTitle(self, title): + self.title = title + " " + self.name + + def setPlotSize(self, width, height): + self.width = width + self.height = height + + def setLabels(self, xLabel, yLabel): + self.xLabel = xLabel + self.yLabel = yLabel + + def setMerge(self, merge): + self.merge = merge + + def setCoverageData(self, coverage): + outputCoveragePerStrand = dict([strand, 0] for strand in strands) + outputCoverage = 0 + dataFile = open(os.path.abspath(self.coverageDataFileName), "w") + for position in range(self.size+1): + sumValue = 0 + found = False + dataFile.write("%d\t" % (position)) + for strand in strands: + value = coverage[strand].get(position, 0) + sumValue += value + dataFile.write("%d\t" % (value)) + if value > 0: + found = True + outputCoveragePerStrand[strand] += 1 + self.maxCoverage = max(self.maxCoverage, sumValue) + dataFile.write("%d\n" % (sumValue)) + if found: + outputCoverage += 1 + dataFile.close() + self.log += "%s (%d nt):\n - both strands: %d (%.0f%%)\n - (+) strand: %d (%.0f%%)\n - (-) strand: %d (%.0f%%)\n" % (self.name, self.size, outputCoverage, float(outputCoverage) / self.size * 100, outputCoveragePerStrand[1], float(outputCoveragePerStrand[1]) / self.size * 100, outputCoveragePerStrand[-1], float(outputCoveragePerStrand[-1]) / self.size * 100) + self.coverageScript += "data = scan(\"%s\", list(pos = -666, minus = -666, plus = -666, sumValue = -666), sep=\"\t\")\n" % 
(os.path.abspath(self.coverageDataFileName)) + self.coverageScript += "lines(x = data$pos, y = data$minus, col = \"%s\")\n" % (colors[-1]) + self.coverageScript += "lines(x = data$pos, y = data$plus, col = \"%s\")\n" % (colors[1]) + self.coverageScript += "lines(x = data$pos, y = data$sumValue, col = \"%s\")\n" % (colors[0]) + + def setOverlapData(self, overlap): + height = 1 + self.maxOverlap = (len(overlap) + 1) * height + thisElement = SimpleTranscript(self.transcript, self.transcript, "black") + self.overlapScript += thisElement.getRScript(0, height) + for cpt, transcript in enumerate(sorted(overlap, cmp=lambda c1, c2: c1.start - c2.start if c1.start != c2.start else c1.end - c2.end)): + self.overlapScript += transcript.getRScript((cpt + 1) * height, height) + + def getFirstLine(self, suffix = None): + return "png(file = \"%s_%s%s.png\", width = %d, height = %d, bg = \"white\")\n" % (self.outputFileName, self.name, "" if suffix == None or self.merge else "_%s" % (suffix), self.width, self.height) + + def getLastLine(self): + return "dev.off()\n" + + def startR(self, fileName, script): + scriptFile = open(fileName, "w") + scriptFile.write(script) + scriptFile.close() + command = "R CMD BATCH %s" % (fileName) + status = subprocess.call(command, shell=True) + if status != 0: + raise Exception("Problem with the execution of script file %s, status is: %s" % (fileName, status)) + + def plot(self): + print "outputfileName is written in :", self.outputFileName + if self.merge: + fileName = "%s_%d_%s.R" % (self.outputFileName, self.seed, self.index) + plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, max(self.maxCoverage, self.maxOverlap), self.title) + script = self.getFirstLine() + plotLine + self.overlapScript + self.coverageScript + self.getLastLine() + self.startR(fileName, script) + else: + 
fileName = "%s_%d_%s_overlap.R" % (self.outputFileName, self.seed, self.index) + print "overlap file is written in :", fileName + plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, self.maxOverlap, self.title) + script = self.getFirstLine("overlap") + plotLine + self.overlapScript + self.getLastLine() + self.startR(fileName, script) + fileName = "%s_%d_%s_coverage.R" % (self.outputFileName, self.seed, self.index) + plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, self.maxCoverage, self.title) + script = self.getFirstLine("coverage") + plotLine + self.coverageScript + self.getLastLine() + self.startR(fileName, script) + + +class PlotParser(object): + + def __init__(self, verbosity): + self.verbosity = verbosity + self.parsers = [None, None] + self.sequenceParser = None + self.seed = random.randint(0, 10000) + self.title = "" + self.merge = False + + def __del__(self): + for fileName in glob.glob("tmpFile_%d*.dat" % (self.seed)): + os.remove(fileName) + for fileName in glob.glob("%s*.R" % (os.path.abspath(self.outputFileName))): + os.remove(fileName) + for fileName in glob.glob("%s*.Rout" % (os.path.abspath(self.outputFileName))): + os.remove(fileName) + + def addInput(self, inputNb, fileName, fileFormat): + if fileName == None: + return + self.parsers[inputNb] = TranscriptContainer(fileName, fileFormat, self.verbosity) + if inputNb == 0: + self.parsers[1] = self.parsers[0] + + def addSequence(self, fileName): + if fileName == None: + return + self.sequenceParser = FastaParser(fileName, self.verbosity) + + def setOutput(self, fileName): + self.outputFileName = fileName + + def setPlotSize(self, width, height): + self.width = width 
+ self.height = height + + def setLabels(self, xLabel, yLabel): + self.xLabel = xLabel + self.yLabel = yLabel + + def setTitle(self, title): + self.title = title + + def setMerge(self, merge): + self.merge = merge + + def initializeDataFromSequences(self): + self.sizes = {} + self.coverage = {} + self.overlap = {} + for region in self.sequenceParser.getRegions(): + self.sizes[region] = self.sequenceParser.getSizeOfRegion(region) + self.coverage[region] = {} + self.overlap[region] = [] + for strand in strands: + self.coverage[region][strand] = {} + self.coverage[region][strand][1] = 0 + self.coverage[region][strand][self.sizes[region]] = 0 + + + def initializeDataFromTranscripts(self): + self.coverage = dict([i, None] for i in range(self.parsers[1].getNbTranscripts())) + self.overlap = dict([i, None] for i in range(self.parsers[1].getNbTranscripts())) + self.sizes = dict([i, 0] for i in range(self.parsers[1].getNbTranscripts())) + self.parsers[0].findData() + progress = Progress(self.parsers[1].getNbTranscripts(), "Reading regions", self.verbosity) + for cpt, transcript in enumerate(self.parsers[1].getIterator()): + self.coverage[cpt] = {} + self.overlap[cpt] = [] + for strand in strands: + self.coverage[cpt][strand] = {} + self.coverage[cpt][strand][0] = 0 + self.coverage[cpt][strand][transcript.getEnd() - transcript.getStart()] = 0 + for exon in transcript.getExons(): + self.sizes[cpt] += exon.getSize() + progress.inc() + progress.done() + + def initialize(self): + if self.sequenceParser == None: + self.initializeDataFromTranscripts() + else: + self.initializeDataFromSequences() + + def computeCoverage(self, transcript1, transcript2, id): + strand = transcript1.getDirection() * transcript2.getDirection() + for exon1 in transcript1.getExons(): + for exon2 in transcript2.getExons(): + if exon1.overlapWith(exon2): + for position in range(max(exon1.getStart(), exon2.getStart()), min(exon1.getEnd(), exon2.getEnd()) + 1): + relativePosition = position - 
transcript2.getStart() + 1 + self.coverage[id][strand][relativePosition] = self.coverage[id][strand].get(relativePosition, 0) + 1 + + def computeOverlap(self, transcript1, transcript2, id): + simpleTranscript = SimpleTranscript(transcript1, transcript2) + self.overlap[id].append(simpleTranscript) + + def compute2TranscriptFiles(self): + progress = Progress(self.parsers[1].getNbTranscripts(), "Comparing regions", self.verbosity) + for cpt2, transcript2 in enumerate(self.parsers[1].getIterator()): + for transcript1 in self.parsers[0].getIterator(): + if transcript1.overlapWithExon(transcript2): + self.computeCoverage(transcript1, transcript2, cpt2) + self.computeOverlap(transcript1, transcript2, cpt2) + progress.inc() + progress.done() + + def extractReferenceQuery(self, inputTranscript): + if "Target" not in inputTranscript.getTagNames(): + raise Exception("Cannot extract Target field in line '%s'." % (inputTranscript)) + id, start, end, strand = parseTargetField(inputTranscript.getTagValue("Target")) + if id not in self.sizes: + raise Exception("Target id '%s' of transcript '%s' does not correspond to anything in FASTA file." 
% (id, inputTranscript)) + referenceTranscript = Transcript() + referenceTranscript.setChromosome(id) + referenceTranscript.setName(id) + referenceTranscript.setDirection("+") + referenceTranscript.setEnd(self.sizes[id]) + referenceTranscript.setStart(1) + queryTranscript = Transcript() + queryTranscript.setChromosome(id) + queryTranscript.setName(id) + queryTranscript.setStart(start) + queryTranscript.setEnd(end) + queryTranscript.setDirection(strand) + if inputTranscript.getNbExons() > 1: + factor = float(end - start) / (inputTranscript.getEnd() - inputTranscript.getStart()) + for exon in inputTranscript.getExons(): + newExon = Interval() + newExon.setChromosome(id) + newExon.setDirection(strand) + if "Target" in inputTranscript.getTagNames(): + id, start, end, strand = parseTargetField(exon.getTagValue("Target")) + newExon.setStart(start) + newExon.setEnd(end) + else: + newExon.setStart(int(round((exon.getStart() - inputTranscript.getStart()) * factor)) + start) + newExon.setEnd( int(round((exon.getEnd() - inputTranscript.getStart()) * factor)) + start) + queryTranscript.addExon(newExon) + return (referenceTranscript, queryTranscript) + + def compute1TranscriptFiles(self): + progress = Progress(self.parsers[1].getNbTranscripts(), "Comparing regions", self.verbosity) + for transcript in self.parsers[1].getIterator(): + referenceTranscript, queryTranscript = self.extractReferenceQuery(transcript) + self.computeCoverage(queryTranscript, referenceTranscript, referenceTranscript.getName()) + self.computeOverlap(queryTranscript, referenceTranscript, referenceTranscript.getName()) + progress.inc() + progress.done() + + def compute(self): + if self.sequenceParser == None: + self.compute2TranscriptFiles() + else: + self.compute1TranscriptFiles() + + def plotTranscript(self, index, transcript): + plotter = Plotter(self.seed, index, self.verbosity) + plotter.setOutputFileName(self.outputFileName) + plotter.setTranscript(transcript) + plotter.setTitle(self.title) + 
plotter.setLabels(self.xLabel, self.yLabel) + plotter.setPlotSize(self.width, self.height) + plotter.setCoverageData(self.coverage[index]) + plotter.setOverlapData(self.overlap[index]) + plotter.setMerge(self.merge) + plotter.plot() + output = plotter.log + return output + + def plot1TranscriptFile(self): + self.outputCoverage = {} + self.outputCoveragePerStrand = {} + output = "" + progress = Progress(len(self.sequenceParser.getRegions()), "Plotting regions", self.verbosity) + for cpt2, region in enumerate(self.sequenceParser.getRegions()): + transcript = Transcript() + transcript.setName(region) + transcript.setDirection("+") + transcript.setEnd(self.sizes[region]) + transcript.setStart(1) + output += self.plotTranscript(region, transcript) + progress.inc() + progress.done() + if self.verbosity > 0: + print output + + def plot2TranscriptFiles(self): + self.outputCoverage = [0] * self.parsers[1].getNbTranscripts() + self.outputCoveragePerStrand = [None] * self.parsers[1].getNbTranscripts() + for cpt in range(self.parsers[1].getNbTranscripts()): + self.outputCoveragePerStrand[cpt] = dict([strand, 0] for strand in strands) + progress = Progress(self.parsers[1].getNbTranscripts(), "Plotting regions", self.verbosity) + output = "" + for cpt2, transcript2 in enumerate(self.parsers[1].getIterator()): + output += self.plotTranscript(cpt2, transcript2) + progress.inc() + progress.done() + if self.verbosity > 0: + print output + + def plot(self): + if self.sequenceParser == None: + self.plot2TranscriptFiles() + else: + self.plot1TranscriptFile() + + def start(self): + self.initialize() + self.compute() + self.plot() + + +if __name__ == "__main__": + + # parse command line + description = "Plot Coverage v1.0.1: Plot the coverage of the first data with respect to the second one. 
[Category: Visualization]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--inputFormat1", dest="inputFormat1", action="store", type="string", help="format of input file 1 [compulsory] [format: transcript file format]") + parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]") + parser.add_option("-g", "--inputFormat2", dest="inputFormat2", action="store", type="string", help="format of input file 2 [compulsory] [format: transcript file format]") + parser.add_option("-q", "--sequence", dest="inputSequence", action="store", default=None, type="string", help="input sequence file [format: file in FASTA format] [default: None]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]") + parser.add_option("-w", "--width", dest="width", action="store", default=1500, type="int", help="width of the plots (in px) [format: int] [default: 1500]") + parser.add_option("-e", "--height", dest="height", action="store", default=1000, type="int", help="height of the plots (in px) [format: int] [default: 1000]") + parser.add_option("-t", "--title", dest="title", action="store", default="", type="string", help="title of the plots [format: string]") + parser.add_option("-x", "--xlab", dest="xLabel", action="store", default="", type="string", help="label on the x-axis [format: string]") + parser.add_option("-y", "--ylab", dest="yLabel", action="store", default="", type="string", help="label on the y-axis [format: string]") + parser.add_option("-p", "--plusColor", dest="plusColor", action="store", default="red", type="string", help="color for the elements on the plus 
strand [format: string] [default: red]") + parser.add_option("-m", "--minusColor", dest="minusColor", action="store", default="blue", type="string", help="color for the elements on the minus strand [format: string] [default: blue]") + parser.add_option("-s", "--sumColor", dest="sumColor", action="store", default="black", type="string", help="color for 2 strands coverage line [format: string] [default: black]") + parser.add_option("-l", "--lineColor", dest="lineColor", action="store", default="black", type="string", help="color for the lines [format: string] [default: black]") + parser.add_option("-1", "--merge", dest="merge", action="store_true", default=False, help="merge the 2 plots in 1 [format: boolean] [default: false]") + parser.add_option("-D", "--directory", dest="working_Dir", action="store", default=os.getcwd(), type="string", help="the directory to store the results [format: directory]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + colors[1] = options.plusColor + colors[-1] = options.minusColor + colors[0] = options.sumColor + colorLine = options.lineColor + + pp = PlotParser(options.verbosity) + pp.addInput(0, options.inputFileName1, options.inputFormat1) + pp.addInput(1, options.inputFileName2, options.inputFormat2) + pp.addSequence(options.inputSequence) + if options.working_Dir[-1] != '/': + path = options.working_Dir + '/' + pp.setOutput(path + options.outputFileName) + pp.setPlotSize(options.width, options.height) + pp.setLabels(options.xLabel, options.yLabel) + pp.setTitle(options.title) + pp.setMerge(options.merge) + pp.start() + + diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/plotCsv.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/plotCsv.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,146 @@ +#! 
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +""" +Plot the data from the data files +""" + +import os +import re +from optparse import OptionParser +from SMART.Java.Python.misc.RPlotter import * +from SMART.Java.Python.misc.Progress import * + + +def mergeData(line1, line2): + if line1.keys() != line2.keys(): + sys.exit("Error! Input files do not correspond to each other! 
Aborting...") + mergedData = {} + for key in line1: + mergedData[key] = (line1[key], line2[key]) + return mergedData + + + +if __name__ == "__main__": + + # parse command line + description = "Plot CSV v1.0.1: Plot the content of a CSV file. [Category: Personnal]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileNames", action="store", type="string", help="input file [compulsory] [format: file in CSV format]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]") + parser.add_option("-s", "--shape", dest="shape", action="store", type="string", help="shape of the plot [format: choice (line, bar, points, heatPoints)]") + parser.add_option("-l", "--log", dest="log", action="store", default="", type="string", help="use log on x- or y-axis (write 'x', 'y' or 'xy') [format: string] [default: ]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + plotter = RPlotter(options.outputFileName, options.verbosity) + if options.shape == "bar": + plotter.setBarplot(True) + elif options.shape == "points": + plotter.setPoints(True) + elif options.shape == "heatPoints": + plotter.setHeatPoints(True) + + plotter.setLog(options.log) + + lines = [] + nbsColumns = [] + for inputFileName in options.inputFileNames.split(","): + inputFile = open(inputFileName) + line = {} + nbColumns = None + + for point in inputFile: + point = point.strip() + + m = re.search(r"^\s*(\S+)\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s*$", point) + if m != None: + line[m.group(1)] = (float(m.group(2)), float(m.group(3))) + if nbColumns == None: + nbColumns = 3 + nbsColumns.append(nbColumns) + elif nbColumns != 3: + sys.exit("Number of columns changed around line '%s' of file '%s'! Aborting..." 
% (point, inputFileName)) + else: + m = re.search(r"^\s*(\d+\.?\d*)\s+(\d+\.?\d*)\s*$", point) + if m != None: + line[float(m.group(1))] = float(m.group(2)) + if nbColumns == None: + nbColumns = 2 + nbsColumns.append(nbColumns) + if nbColumns != 2: + sys.exit("Number of columns changed around line '%s' of file '%s'! Aborting..." % (point, inputFileName)) + else: + m = re.search(r"^\s*(\S+)\s+(\d+\.?\d*)\s*$", point) + if m != None: + line[m.group(1)] = float(m.group(2)) + if nbColumns == None: + nbColumns = 1 + nbsColumns.append(nbColumns) + if nbColumns != 1: + sys.exit("Number of columns changed around line '%s' of file '%s'! Aborting..." % (point, inputFileName)) + else: + sys.exit("Do not understand line '%s' of file '%s'! Aborting..." % (point, inputFileName)) + + lines.append(line) + + if len(lines) != len(nbsColumns): + sys.exit("Something is wrong in the input files! Aborting...") + + if options.shape == "bar": + if len(lines) != 1: + sys.exit("Error! Bar plot should have exactly one input file! Aborting...") + if nbsColumns[0] != 2: + sys.exit("Error! Bar plot input file should have exactly two columns! Aborting...") + plotter.addLine(lines[0]) + elif options.shape == "points": + if len(lines) != 2: + sys.exit("Error! Points cloud should have exactly two input file! Aborting...") + if nbsColumns[0] != 2 or nbsColumns[1] != 2: + sys.exit("Error! Points cloud plot input file should have exactly two columns! Aborting...") + plotter.addLine(mergedData(lines[0], lines[1])) + elif options.shape == "heatPoints": + if len(lines) != 3: + sys.exit("Error! Heat points cloud should have exactly three input file! Aborting...") + plotter.addLine(mergeData(lines[0], lines[1])) + plotter.addHeatLine(lines[2]) + elif options.shape == "line": + for i in range(0, len(lines)): + if (nbsColumns[i] != 2): + sys.exit("Error! Curve plot input file should have exactly two columns! 
Aborting...") + plotter.addLine(lines[i]) + else: + sys.exit("Do not understand shape '%s'" % (options.shape)) + + + plotter.plot() + diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/plotGenomeCoverage.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/plotGenomeCoverage.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,132 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# + +from optparse import OptionParser +from commons.core.parsing.FastaParser import FastaParser +from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer +from SMART.Java.Python.misc.Progress import Progress +from SMART.Java.Python.misc.RPlotter import RPlotter +from SMART.Java.Python.misc.Utils import * + + +class GetGenomeCoverage(object): + + def __init__(self, verbosity = 1): + self.verbosity = verbosity + self.inputContainer = None + self.referenceParser = None + self.outputFileName = None + self.genomeSize = None + self.coverage = {} + self.distribution = {} + + + def setInputFile(self, fileName, format): + self.inputContainer = TranscriptContainer(fileName, format, self.verbosity) + + + def setOutputFile(self, fileName): + self.outputFileName = fileName + + + def setReference(self, fileName): + self.referenceParser = FastaParser(fileName, self.verbosity) + + + def getReferenceSizes(self): + self.genomeSize = 0 + for chromosome in self.referenceParser.getRegions(): + self.genomeSize += self.referenceParser.getSizeOfRegion(chromosome) + + + def getCoverage(self): + progress = Progress(self.inputContainer.getNbTranscripts(), "Reading reads", self.verbosity) + for transcript in self.inputContainer.getIterator(): + chromosome = transcript.getChromosome() + if chromosome not in self.coverage: + self.coverage[chromosome] = {} + for exon in transcript.getExons(): + for pos in range(exon.getStart(), exon.getEnd() + 1): + if pos not in self.coverage[chromosome]: + self.coverage[chromosome][pos] = 1 + else: + self.coverage[chromosome][pos] += 1 + progress.inc() + progress.done() + + + def getDistribution(self): + nbNucleotides = sum([len(self.coverage[chromosome].keys()) for chromosome in self.coverage]) + progress = Progress(nbNucleotides, "Building distribution", self.verbosity) + for chromosome in self.coverage: + for num in self.coverage[chromosome].values(): + if num not in self.distribution: + self.distribution[num] = 1 + else: + 
self.distribution[num] += 1 + progress.inc() + progress.done() + self.distribution[0] = self.genomeSize - nbNucleotides + + + def plotDistribution(self): + plotter = RPlotter(self.outputFileName, self.verbosity) + plotter.setFill(0) + plotter.addLine(self.distribution) + plotter.plot() + print "min/avg/med/max reads per base: %d/%.2f/%.1f/%d" % getMinAvgMedMax(self.distribution) + + + def run(self): + self.getReferenceSizes() + self.getCoverage() + self.getDistribution() + self.plotDistribution() + + +if __name__ == "__main__": + + # parse command line + description = "Plot Genome Coverage v1.0.1: Get the coverage of a genome. [Category: Personal]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="reads file [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [compulsory] [format: transcript file format]") + parser.add_option("-r", "--reference", dest="reference", action="store", type="string", help="sequences file [compulsory] [format: file in FASTA format]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + getGenomeCoverage = GetGenomeCoverage(options.verbosity) + getGenomeCoverage.setInputFile(options.inputFileName, options.format) + getGenomeCoverage.setOutputFile(options.outputFileName) + getGenomeCoverage.setReference(options.reference) + getGenomeCoverage.run() diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/plotRepartition.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/plotRepartition.py Thu Jan 17 
10:52:14 2013 -0500 @@ -0,0 +1,128 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +""" +Plot the data from the data files +""" +import os +from optparse import OptionParser +from commons.core.parsing.GffParser import GffParser +from SMART.Java.Python.misc.RPlotter import RPlotter +from SMART.Java.Python.misc.Progress import Progress + + +if __name__ == "__main__": + + # parse command line + description = "Plot Repartition v1.0.1: Plot the repartition of different data on a whole genome. 
(This tool uses 1 input file only, the different values being stored in the tags. See documentation to know more about it.) [Category: Visualization]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file name [compulsory] [format: file in GFF3 format]") + parser.add_option("-n", "--names", dest="names", action="store", default=None, type="string", help="name for the tags (separated by commas and no space) [default: None] [format: string]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]") + parser.add_option("-c", "--color", dest="colors", action="store", default=None, type="string", help="color of the lines (separated by commas and no space) [format: string]") + parser.add_option("-f", "--format", dest="format", action="store", default="png", type="string", help="format of the output file [format: string] [default: png]") + parser.add_option("-r", "--normalize", dest="normalize", action="store_true", default=False, help="normalize data (when panels are different) [format: bool] [default: false]") + parser.add_option("-l", "--log", dest="log", action="store", default="", type="string", help="use log on x- or y-axis (write 'x', 'y' or 'xy') [format: string]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + parser.add_option("-D", "--directory", dest="working_Dir", action="store", default=os.getcwd(), type="string", help="the directory to store the results [format: directory]") + (options, args) = parser.parse_args() + + strands = [1, -1] + strandToString = {1: "+", -1: "-"} + names = [None] if options.names == None else options.names.split(",") + maxs = {} + nbElements = [0 for name in names] + lines = [{} for i in range(len(names))] + if options.colors == None: + 
colors = [None for i in range(len(names))] + else: + colors = options.colors.split(",") + + parser = GffParser(options.inputFileName, options.verbosity) + progress = Progress(parser.getNbTranscripts(), "Reading %s" % (options.inputFileName), options.verbosity) + for transcript in parser.getIterator(): + chromosome = transcript.getChromosome() + direction = transcript.getDirection() + start = transcript.getStart() + for i, name in enumerate(names): + if chromosome not in lines[i]: + lines[i][chromosome] = dict([(strand, {}) for strand in strands]) + if chromosome not in maxs: + maxs[chromosome] = transcript.getStart() + else: + maxs[chromosome] = max(maxs[chromosome], start) + if start not in lines[i][chromosome][direction]: + lines[i][chromosome][direction][start] = 0 + thisNbElements = float(transcript.getTagValue(name)) if name != None and name in transcript.getTagNames() else 1 + lines[i][chromosome][direction][start] += thisNbElements * direction + nbElements[i] += thisNbElements + progress.inc() + progress.done() + + if options.normalize: + if options.verbosity >= 10: + print "Normalizing..." + for i, linesPerCondition in enumerate(lines): + for linesPerChromosome in linesPerCondition.values(): + for line in linesPerChromosome.values(): + for key, value in line.iteritems(): + line[key] = value / float(nbElements[i]) * max(nbElements) + if options.verbosity >= 10: + print "... done." + + progress = Progress(len(maxs.keys()), "Plotting", options.verbosity) + for chromosome in maxs: + plot = RPlotter("%s%s.%s" % (options.outputFileName, chromosome.capitalize(), options.format), options.verbosity) + plot.setLog(options.log) + plot.setImageSize(2000, 500) + plot.setFormat(options.format) + if maxs[chromosome] <= 1000: + unit = "nt." 
+ ratio = 1.0 + elif maxs[chromosome] <= 1000000: + unit = "kb" + ratio = 1000.0 + else: + unit = "Mb" + ratio = 1000000.0 + plot.setXLabel("Position on %s (in %s)" % (chromosome.replace("_", " "), unit)) + plot.setYLabel("# reads") + plot.setLegend(True) + for i, name in enumerate(names): + for strand in strands: + correctedLine = dict([(key / ratio, value) for key, value in lines[i][chromosome][strand].iteritems()]) + if name != None: + name = "%s (%s)" % (name.replace("_", " "), strandToString[strand]) + plot.addLine(correctedLine, None, colors[i]) + plot.plot() + progress.inc() + progress.done() + diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/plotTranscriptList.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/plotTranscriptList.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,255 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. 
# Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""
Plot the data from the data files
"""
import sys
import math
from optparse import OptionParser
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from SMART.Java.Python.misc.RPlotter import RPlotter


class PlotTranscriptList(object):
    """Read tag values from a list of transcripts and plot them through RPlotter.

    The x, y, z attributes hold the *names* of the tags to read; xDefault,
    yDefault and zDefault are used when a transcript lacks the tag.  The
    supported shapes are "line", "barplot", "points" and "heatPoints".
    """

    def __init__(self, verbosity = 0):
        self.inputFileName = None
        self.format = None
        self.x = None
        self.y = None
        self.z = None
        self.xDefault = None
        self.yDefault = None
        self.zDefault = None
        self.xLabel = None
        self.yLabel = None
        self.shape = None
        self.bucket = None
        self.keep = None
        self.log = None
        # Fix: the constructor argument used to be discarded (set to None), so
        # the plotter and the parser never received the requested trace level.
        self.verbosity = verbosity


    def setPlotter(self, outputFileName, keep, log, xLabel, yLabel):
        """Create the RPlotter; self.shape must already be set by the caller."""
        self.plotter = RPlotter(outputFileName, self.verbosity, keep)
        # The log setting is not forwarded for barplots: clusterInBarplot
        # applies the logarithm to the keys itself.
        if self.shape != "barplot":
            self.plotter.setLog(log)
        self.plotter.setXLabel(xLabel)
        self.plotter.setYLabel(yLabel)


    def setShape(self, shape):
        """Configure the plotter for the given shape; exit on an unknown shape."""
        # Fix: the first test used self.shape instead of the argument, so
        # setShape("line") was rejected unless the attribute had been set
        # beforehand.  The shape is also recorded so that run() agrees with
        # the plotter configuration.
        if shape == "line":
            pass
        elif shape == "barplot":
            self.plotter.setBarplot(True)
        elif shape == "points":
            self.plotter.setPoints(True)
        elif shape == "heatPoints":
            self.plotter.setHeatPoints(True)
        else:
            sys.exit("Do not understand shape '%s'" % (shape))
        self.shape = shape


    def setInput(self, inputFileName, format):
        """Open the transcript file with the parser matching the given format."""
        self.inputFileName = inputFileName
        self.format = format
        self.parser = TranscriptContainer(inputFileName, format, self.verbosity)


    def getValues(self, transcript):
        """Return the (x, y, z) values of a transcript as floats (None when unused).

        Exits when a required tag is missing and no default value was given.
        """
        x, y, z = None, None, None
        x = transcript.getTagValue(self.x)
        if self.y != None:
            y = transcript.getTagValue(self.y)
        if self.z != None:
            z = transcript.getTagValue(self.z)
        if x == None:
            if self.xDefault != None:
                x = self.xDefault
            else:
                sys.exit("Error! Transcript %s do not have the x-tag %s" % (transcript, self.x))
        # y is only compulsory for the shapes which actually use it
        if y == None and self.shape != "line" and self.shape != "barplot":
            if self.yDefault != None:
                y = self.yDefault
            else:
                sys.exit("Error! Transcript %s do not have the y-tag %s" % (transcript, self.y))
        if self.z != None:
            if z == None:
                if self.zDefault != None:
                    z = self.zDefault
                else:
                    sys.exit("Error! Transcript %s do not have the z-tag %s" % (transcript, self.z))
        x = float(x)
        # Fix: convert y only when a value exists; a missing y tag is legal for
        # "line" and "barplot" and used to crash in float(None).
        if y != None:
            y = float(y)
        if self.z != None:
            z = float(z)
        return (x, y, z)


    def readFile(self):
        """Read all the transcripts and return the pair (line, heatLine).

        For "points"/"heatPoints", line maps a transcript name to (x, y) and
        heatLine maps it to z; for "line"/"barplot", line maps each x value to
        the number of transcripts having it.
        """
        cpt = 1
        line = {}
        heatLine = {}
        for transcript in self.parser.getIterator():
            x, y, z = self.getValues(transcript)

            name = transcript.name
            if name == "unnamed transcript":
                # give anonymous transcripts distinct dictionary keys
                name = "transcript %d" % (cpt)
                cpt += 1
            if self.shape == "points":
                line[name] = (x, y)
            elif self.shape == "heatPoints":
                line[name] = (x, y)
                heatLine[name] = z
            elif self.shape == "line" or self.shape == "barplot":
                if x not in line:
                    line[x] = 1
                else:
                    line[x] += 1
            else:
                sys.exit("Do not understand shape '%s'" % (self.shape))
        return line, heatLine


    def putLineInBuckets(self, line):
        """Group the x values of a count line into buckets of width self.bucket."""
        bucketed = {}
        for key, value in line.items():
            bucketKey = int(key / float(self.bucket)) * self.bucket
            # Fix: counts falling into the same bucket are summed; they used to
            # overwrite each other, leaving an arbitrary one in the result.
            bucketed[bucketKey] = bucketed.get(bucketKey, 0) + value
        return bucketed


    def clusterInBarplot(self, line):
        """Distribute a count line into int(self.bucket) + 1 regularly spaced bins.

        When self.log is not the empty string the bins are spaced on the
        logarithm of the keys, and zero keys are left out.
        """
        nbZeros = 0
        minValue = min(line.keys())
        maxValue = max(line.keys())
        if self.log != "":
            if minValue == 0:
                # zero has no logarithm: look for the smallest non-zero key
                minValue = 1000000000
                for value in line:
                    if value < minValue:
                        if value == 0:
                            nbZeros += 1
                        else:
                            minValue = value
            minValue = math.log(minValue)
            maxValue = math.log(maxValue)
        bucketSize = (maxValue - minValue) / self.bucket
        tmpLine = line
        line = {}
        for i in range(int(self.bucket) + 1):
            line[i * bucketSize + minValue] = 0
        for key, value in tmpLine.items():
            if self.log != "":
                if key == 0:
                    continue
                # Fix: the zero test is now made before taking the logarithm;
                # it used to be made afterwards, which silently dropped the
                # key 1 (log(1) == 0).
                key = math.log(key)
            bucketKey = int((key - minValue) / bucketSize) * bucketSize + minValue
            line[bucketKey] += value
#        if self.log != "":
#            tmpLine = line
#            line = {}
#            for key, value in tmpLine.iteritems():
#                line[math.exp(key)] = value
        print("%d zeros have been removed" % (nbZeros))
        return line


    def getSpearmanRho(self):
        """Print the Spearman rho computed by the plotter, if there is one."""
        rho = self.plotter.getSpearmanRho()
        if rho == None:
            print("Cannot compute Spearman rho.")
        else:
            print("Spearman rho: %f" % (rho))


    def run(self):
        """Read the input, bucket it when requested, and draw the plot."""
        line, heatLine = self.readFile()

        if self.shape == "line" and self.bucket != None:
            line = self.putLineInBuckets(line)
        if self.shape == "barplot":
            line = self.clusterInBarplot(line)

        if self.shape == "points" or self.shape == "barplot" or self.shape == "line":
            self.plotter.addLine(line)
        elif self.shape == "heatPoints":
            self.plotter.addLine(line)
            self.plotter.addHeatLine(heatLine)
        else:
            sys.exit("Do not understand shape '%s'" % (self.shape))

        self.plotter.plot()

        if self.shape == "points" or self.shape == "heatPoints":
            self.getSpearmanRho()



if __name__ == "__main__":

    # parse command line
    description = "Plot v1.0.2: Plot some information from a list of transcripts. [Category: Visualization]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input [compulsory] [format: transcript file format]")
    parser.add_option("-x", "--x", dest="x", action="store", type="string", help="tag for the x value [format: string]")
    parser.add_option("-y", "--y", dest="y", action="store", type="string", help="tag for the y value [format: string]")
    parser.add_option("-z", "--z", dest="z", action="store", default=None, type="string", help="tag for the z value [format: string]")
    parser.add_option("-X", "--xDefault", dest="xDefault", action="store", default=None, type="float", help="value for x when tag is not present [format: float]")
    parser.add_option("-Y", "--yDefault", dest="yDefault", action="store", default=None, type="float", help="value for y when tag is not present [format: float]")
    parser.add_option("-Z", "--zDefault", dest="zDefault", action="store", default=None, type="float", help="value for z when tag is not present [format: float]")
    parser.add_option("-n", "--xLabel", dest="xLabel", action="store", default="", type="string", help="label on the x-axis [format: string] [default: ]")
    parser.add_option("-m", "--yLabel", dest="yLabel", action="store", default="", type="string", help="label on the y-axis [format: string] [default: ]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file names [format: output file in PNG format]")
    parser.add_option("-s", "--shape", dest="shape", action="store", type="string", help="shape of the plot [format: choice (barplot, line, points, heatPoints)]")
    parser.add_option("-b", "--bucket", dest="bucket", action="store", default=None, type="float", help="bucket size (for the line plot) [format: int] [default: 1]")
    parser.add_option("-k", "--keep", dest="keep", action="store_true", default=False, help="keep temporary files [format: bool]")
    parser.add_option("-l", "--log", dest="log", action="store", default="", type="string", help="use log on x- or y-axis (write 'x', 'y' or 'xy') [format: string] [default: ]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    plotTranscriptList = PlotTranscriptList(options.verbosity)
    plotTranscriptList.x = options.x
    plotTranscriptList.y = options.y
    plotTranscriptList.z = options.z
    plotTranscriptList.xDefault = options.xDefault
    plotTranscriptList.yDefault = options.yDefault
    plotTranscriptList.zDefault = options.zDefault
    plotTranscriptList.shape = options.shape
    plotTranscriptList.bucket = options.bucket
    plotTranscriptList.log = options.log
    plotTranscriptList.setPlotter(options.outputFileName, options.keep, options.log, options.xLabel, options.yLabel)
    plotTranscriptList.setShape(options.shape)
    plotTranscriptList.setInput(options.inputFileName, options.format)
    plotTranscriptList.run()

# ---- patch boundary kept from the original changeset ----
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/qualToFastq.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/qualToFastq.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,87 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import sys
from optparse import OptionParser
from commons.core.parsing.SequenceListParser import SequenceListParser
from SMART.Java.Python.misc.Progress import Progress

"""
Transform qual and fasta files to a single fastq file
"""

if __name__ == "__main__":

    # parse command line
    description = "Qual To FastQ v1.0.2: Convert a file in FASTA/Qual format to FastQ format. [Category: Conversion]"

    parser = OptionParser(description = description)
    parser.add_option("-f", "--fasta", dest="fastaFileName", action="store", type="string", help="input fasta file [compulsory] [format: file in FASTA format]")
    parser.add_option("-q", "--qual", dest="qualFileName", action="store", type="string", help="input qual file [compulsory] [format: file in TXT format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [compulsory] [format: output file in FASTQ format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # Fix: the three handles used to stay open, so an exception raised while
    # converting could leave a partially flushed FASTQ file behind.  They are
    # now closed whatever happens.
    fastaFile = None
    qualFile = None
    fastqFile = None
    try:
        fastaFile = open(options.fastaFileName)
        qualFile = open(options.qualFileName)
        fastqFile = open(options.outputFileName, "w")

        # The two files are read in lock step: line i of the qual file must
        # describe line i of the FASTA file.
        fastaLine = fastaFile.readline().strip()
        qualLine = qualFile.readline().strip()
        header = None
        cpt = 0
        while fastaLine:
            if not qualLine:
                raise Exception("Qual file is shorter!")
            if fastaLine[0] == ">":
                header = fastaLine[1:]
                if qualLine[0] != ">":
                    raise Exception("Discrepencies around %s!" % (header))
                fastqFile.write("@%s\n" % (header))
            else:
                if qualLine[0] == ">":
                    raise Exception("Discrepencies around %s!" % (qualLine[1:]))
                intQualities = qualLine.split()
                if len(intQualities) != len(fastaLine):
                    raise Exception("Sizes of read and quality diverge in %s!" % (header))
                # qualities are capped at 93 and shifted by 33, so the highest
                # character written is chr(126)
                chrQualities = [chr(min(int(quality), 93) + 33) for quality in intQualities]
                fastqFile.write("%s\n+\n%s\n" % (fastaLine, "".join(chrQualities)))
            fastaLine = fastaFile.readline().strip()
            qualLine = qualFile.readline().strip()
            if cpt % 1000 == 0 and options.verbosity > 1:
                sys.stdout.write("%d lines read\r" % (cpt))
                sys.stdout.flush()
            cpt += 1
        if options.verbosity > 0:
            print("%d lines read" % (cpt))

        if qualLine:
            raise Exception("Qual file is longer!")
    finally:
        for handle in (fastqFile, qualFile, fastaFile):
            if handle is not None:
                handle.close()

# ---- patch boundary kept from the original changeset ----
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/removeAllTmpTables.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/removeAllTmpTables.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,64 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge.
# Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""Remove all tmp tables in the MySQL database"""

import os
import glob
from optparse import OptionParser
from SMART.Java.Python.mySql.MySqlConnection import *


if __name__ == "__main__":

    description = "Remove Tables v1.0.2: Remove tables in the local MySQL database. [Category: Other]"

    parser = OptionParser(description = description)
    parser.add_option("-t", "--tmp", dest="tmp", action="store_true", default=False, help="Remove temporary tables only [format: bool] [default: false]")
    parser.add_option("-f", "--files", dest="files", action="store_false", default=True, help="Do not remove temporary files [format: bool] [default: true]")
    (options, args) = parser.parse_args()

    # Both headers are always printed; the deletions themselves only happen
    # when -f/--files was not given.
    # NOTE(review): options.tmp is parsed but never read in this script.
    print("Removing temporary databases:")
    if options.files:
        for databaseFile in glob.glob("smartdb*"):
            print(" removing %s" % (databaseFile))
            os.unlink(databaseFile)
    print("Removing temporary files:")
    if options.files:
        # data files, R scripts and R outputs, in that order
        for pattern in ("tmp*.dat", "tmp*.R", "tmp*.Rout"):
            for temporaryFile in glob.glob(pattern):
                print(" removing %s" % (temporaryFile))
                os.unlink(temporaryFile)

# ---- patch boundary kept from the original changeset ----
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/removeEmptySequences.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/removeEmptySequences.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,135 @@
#!
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
"""
Remove empty sequences from a FASTA or FASTQ file
"""

import os, random
import sys
from optparse import OptionParser
from commons.core.parsing.FastaParser import *
from commons.core.parsing.FastqParser import *
from commons.core.writer.FastaWriter import *
from commons.core.writer.FastqWriter import *
from SMART.Java.Python.misc.Progress import *


class EmptySequenceRemover(object):
    """Copy a FASTA/FASTQ file, leaving out the empty and the forbidden sequences.

    forbiddenNames holds names to drop without looking at them; removedNames
    collects the names of the empty sequences met by parse().  Names are
    compared after cutting everything from the first "/" on.
    """

    def __init__(self, verbosity = 1):
        self.verbosity = verbosity
        self.inputFileName = None
        self.parser = None
        self.format = None
        self.writer = None
        self.forbiddenNames = {}
        self.removedNames = {}


    def setInputFileName(self, fileName, format):
        """Open the input file; format is "fasta" or "fastq" (exit otherwise)."""
        self.inputFileName = fileName
        self.format = format
        # Fix: the test used the module-level "options" object, which only
        # exists when this file is run as a script.
        if self.format == "fasta":
            self.parser = FastaParser(self.inputFileName, self.verbosity)
        elif self.format == "fastq":
            self.parser = FastqParser(self.inputFileName, self.verbosity)
        else:
            sys.exit("Do not understand '%s' file format." % (self.format))


    def setOutputFileName(self, fileName):
        """Create the writer; ".mfa" or ".mfq" is appended to fileName.

        Must be called after setInputFileName, which records the format.
        """
        if self.format == "fasta":
            self.writer = FastaWriter("%s.mfa" % (fileName), self.verbosity)
        elif self.format == "fastq":
            self.writer = FastqWriter("%s.mfq" % (fileName), self.verbosity)


    def parse(self):
        """Write every non-empty, non-forbidden sequence; record the empty ones."""
        # Fix: the message and the trace level came from the global "options",
        # so the progress bar always named the first input file.
        progress = Progress(self.parser.getNbSequences(), "Reading sequences in %s" % (self.inputFileName), self.verbosity)
        for sequence in self.parser.getIterator():
            name = sequence.name.split("/")[0]
            if name not in self.forbiddenNames:
                if sequence.sequence == "":
                    self.removedNames[name] = 1
                else:
                    self.writer.addSequence(sequence)
            progress.inc()
        progress.done()
        self.writer.write()


if __name__ == "__main__":

    # parse command line
    description = "Remove Empty Sequences v1.0.2: Remove all the empty sequences in a list. [Category: Personal]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in sequence format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: sequence file format]")
    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 (in case of pair end reads) [format: file in sequence format given by -f] [default: None]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [compulsory] [format: output file in format given by -f]")
    parser.add_option("-p", "--output2", dest="outputFileName2", action="store", default=None, type="string", help="output file 2 (in case of pair end reads) [format: output file in sequence format given by -f] [default: None]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")
    parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")
    (options, args) = parser.parse_args()

    logHandle = None
    if options.log:
        logHandle = open("%s.log" % options.outputFileName, "w")

    try:
        # first pass: drop the empty sequences of the first file
        remover = EmptySequenceRemover(options.verbosity)
        remover.setInputFileName(options.inputFileName, options.format)
        remover.setOutputFileName(options.outputFileName)
        remover.parse()
        removedNames = remover.removedNames
        if options.log:
            for name in removedNames:
                logHandle.write("Removed '%s' in %s\n" % (name, options.inputFileName))
        nbSequences = remover.parser.getNbSequences()

        newRemovedNames = {}
        if options.inputFileName2 != None:
            # second pass: drop from file 2 its own empty sequences and the
            # mates of those removed from file 1
            remover = EmptySequenceRemover(options.verbosity)
            remover.setInputFileName(options.inputFileName2, options.format)
            remover.setOutputFileName(options.outputFileName2)
            remover.forbiddenNames = removedNames
            remover.parse()
            newRemovedNames = remover.removedNames
            if options.log:
                for name in newRemovedNames:
                    logHandle.write("Removed '%s' in %s\n" % (name, options.inputFileName2))

            # third pass: rewrite file 1 without the mates of the sequences
            # which were found empty in file 2
            remover = EmptySequenceRemover(options.verbosity)
            remover.setInputFileName(options.inputFileName, options.format)
            remover.setOutputFileName(options.outputFileName)
            remover.forbiddenNames = newRemovedNames
            remover.parse()
    finally:
        # Fix: the log file used to be left open
        if logHandle is not None:
            logHandle.close()

    nbRemoved = len(removedNames.keys()) + len(newRemovedNames.keys())
    # Fix: an empty input no longer ends in a ZeroDivisionError
    percentage = 0 if nbSequences == 0 else float(nbRemoved) / nbSequences * 100
    print("%d over %d sequences are empty (%.2f%%)." % (nbRemoved, nbSequences, percentage))

# ---- patch boundary kept from the original changeset ----
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/removeExonLines.sh
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/removeExonLines.sh Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,2 @@
# #!/bin/bash
# sed '/exon/d' $1
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/repetGffConverter.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/repetGffConverter.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,71 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""Convert a GFF with REPET format to BED format"""

import os
import sys
from optparse import OptionParser
from commons.core.parsing.GffParser import *
from commons.core.writer.BedWriter import *
from SMART.Java.Python.misc.Progress import *


if __name__ == "__main__":

    # parse command line
    description = "Repet GFF Convert v1.0.1: Convert REPET-flavored GFF to normal GFF. [Category: Personnal]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in GFF3 format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # "parser" is rebound here from the option parser to the GFF parser
    parser = GffParser(options.inputFileName, options.verbosity)
    nbTranscripts = parser.getNbTranscripts()
    transcripts = dict()
    progress = Progress(nbTranscripts, "Analyzing file %s" % (options.inputFileName), options.verbosity)
    for transcript in parser.getIterator():
        # a "...range" feature opens a transcript; each "...hsp" feature is
        # added as an exon to the already seen transcript of the same name
        if transcript.feature.endswith("range"):
            transcripts[transcript.name] = transcript
        elif transcript.feature.endswith("hsp"):
            if transcript.name in transcripts:
                transcripts[transcript.name].addExon(transcript)
            else:
                # Fix: sys is now imported explicitly instead of being
                # expected from one of the star imports
                sys.exit("Transcript %s is not defined\n" % (transcript.name))
        else:
            sys.exit("Do not understand feature %s" % (transcript.feature))
        progress.inc()
    progress.done()

    writer = BedWriter(options.outputFileName, options.verbosity)
    for name in transcripts:
        writer.addTranscript(transcripts[name])

    nbWritten = len(transcripts.keys())
    # Fix: an empty input no longer ends in a ZeroDivisionError
    percentage = 0 if nbTranscripts == 0 else float(nbWritten) / nbTranscripts * 100
    print("%d transcripts out of %d written (%.2f%%)" % (nbWritten, nbTranscripts, percentage))

# ---- patch boundary kept from the original changeset ----
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/restrictFromNucleotides.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/restrictFromNucleotides.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,78 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.
# You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""Remove all dirty sequences"""

import os
import sys
from optparse import OptionParser
from commons.core.parsing.FastaParser import *
from commons.core.writer.FastaWriter import *
from commons.core.parsing.FastqParser import *
from commons.core.writer.FastqWriter import *
from SMART.Java.Python.misc.Progress import *
from SMART.Java.Python.misc.RPlotter import *


if __name__ == "__main__":

    # parse command line
    description = "Restrict from nucleotide v1.0.1: Remove the sequences with ambiguous nucleotides. [Category: Personal]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in sequence format given by -f]")
    # Fix: this option was declared with dest="inputFileName", so
    # options.format did not exist (AttributeError below) and -f overwrote the
    # input file name.
    parser.add_option("-f", "--format", dest="format", action="store", default="fasta", type="string", help="format of the input and output files [compulsory] [format: sequence file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in sequence format given by -f]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")
    (options, args) = parser.parse_args()

    # treat items
    if options.format == "fasta":
        parser = FastaParser(options.inputFileName, options.verbosity)
        writer = FastaWriter(options.outputFileName, options.verbosity)
    elif options.format == "fastq":
        parser = FastqParser(options.inputFileName, options.verbosity)
        writer = FastqWriter(options.outputFileName, options.verbosity)
    else:
        sys.exit("Do not understand '%s' format." % (options.format))
    nbSequences = parser.getNbSequences()
    print("sequences: %d" % (nbSequences))

    progress = Progress(nbSequences, "Analyzing sequences of %s" % (options.inputFileName), options.verbosity)
    nbKept = 0
    for sequence in parser.getIterator():
        if not sequence.containsAmbiguousNucleotides():
            writer.addSequence(sequence)
            nbKept += 1
        progress.inc()
    progress.done()

    # Fix: finish the output the way the sibling scripts do (removeEmptySequences
    # and restrictFromSize both call write() after adding their items).
    # NOTE(review): presumably write() flushes what addSequence buffered --
    # confirm against FastaWriter/FastqWriter.
    writer.write()

    # Fix: an empty input no longer ends in a ZeroDivisionError
    percentage = 0 if nbSequences == 0 else float(nbKept) / nbSequences * 100
    print("%d items, %d kept (%.2f%%)" % (nbSequences, nbKept, percentage))

# ---- patch boundary kept from the original changeset ----
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/restrictFromSize.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/restrictFromSize.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,94 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge.
# Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""Get the size distribution of a Fasta / BED file"""

import os
from optparse import OptionParser
from commons.core.parsing.FastaParser import *
from commons.core.parsing.FastqParser import *
from SMART.Java.Python.structure.TranscriptContainer import *
from commons.core.writer.TranscriptWriter import *
from commons.core.writer.FastaWriter import *
from commons.core.writer.FastqWriter import *
from SMART.Java.Python.misc.Progress import *
from SMART.Java.Python.misc.RPlotter import *


if __name__ == "__main__":

    # parse command line
    description = "Restrict from Size v1.0.1: Select the elements of a list of sequences or transcripts with a given size. [Category: Data Selection]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript or sequence format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input [compulsory] [format: sequence or transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript or sequence format given by -f]")
    parser.add_option("-m", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size [format: int]")
    parser.add_option("-M", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size [format: int]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")
    (options, args) = parser.parse_args()

    # every format other than "fasta" and "fastq" is handled as a transcript format
    if options.format == "fasta":
        parser = FastaParser(options.inputFileName, options.verbosity)
        writer = FastaWriter(options.outputFileName, options.verbosity)
    elif options.format == "fastq":
        parser = FastqParser(options.inputFileName, options.verbosity)
        writer = FastqWriter(options.outputFileName, options.verbosity)
    else:
        parser = TranscriptContainer(options.inputFileName, options.format, options.verbosity)
        writer = TranscriptWriter(options.outputFileName, options.format, options.verbosity)


    # treat items
    nbItems = parser.getNbItems()
    progress = Progress(nbItems, "Analyzing sequences of %s" % (options.inputFileName), options.verbosity)
    # nbRead/nbKept count elements (weighted by the "nbElements" tag when a
    # transcript has one); nbClRead/nbClKept count the items themselves
    nbKept = 0
    nbRead = 0
    nbClKept = 0
    nbClRead = 0
    for item in parser.getIterator():
        size = item.getSize()
        nb = 1 if options.format in ("fasta", "fastq") or "nbElements" not in item.getTagNames() else float(item.getTagValue("nbElements"))
        nbRead += nb
        nbClRead += 1
        if (options.minSize == None or options.minSize <= size) and (options.maxSize == None or options.maxSize >= size):
            writer.addElement(item)
            nbKept += nb
            nbClKept += 1
        progress.inc()
    progress.done()

    writer.write()

    # Fix: the percentage used nbItems (a number of clusters) as denominator
    # while the numerator counts elements, so it could exceed 100%; it is now
    # computed against nbRead, the figure printed next to it.
    print("%d items, %d kept (%.2f%%)" % (nbRead, nbKept, 0 if nbRead == 0 else float(nbKept) / nbRead * 100))
    if nbKept != nbClKept or nbRead != nbClRead:
        print("%d clusters, %d kept (%.2f%%)" % (nbClRead, nbClKept, 0 if nbClRead == 0 else float(nbClKept) / nbClRead * 100))

# ---- patch boundary kept from the original changeset ----
# diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/restrictSequenceList.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/smart_toolShed/SMART/Java/Python/restrictSequenceList.py Thu Jan 17 10:52:14 2013 -0500
# @@ -0,0 +1,113 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+"""Restrict a sequence list with some names"""
+
+from optparse import OptionParser
+from commons.core.parsing.ParserChooser import ParserChooser
+from commons.core.writer.WriterChooser import WriterChooser
+from SMART.Java.Python.misc.Progress import Progress
+from SMART.Java.Python.misc import Utils
+
+class RestrictSequenceList(object):
+    """Copy the sequences whose name is listed in a names file (or is not listed, when excluding)."""
+    def __init__(self, verbosity):  # verbosity: trace level handed to the choosers and to Progress
+        self.verbosity = verbosity
+        self.exclude = False
+
+    def setInputFileName(self, fileName, format):  # format: name given to ParserChooser.findFormat
+        chooser = ParserChooser(self.verbosity)
+        chooser.findFormat(format)
+        self.parser = chooser.getParser(fileName)
+
+    def setExclusion(self, boolean):  # True: write the elements whose name is NOT listed
+        self.exclude = boolean
+
+    def setOutputFileName(self, fileName, format):  # format: name given to WriterChooser.findFormat
+        chooser = WriterChooser(self.verbosity)
+        chooser.findFormat(format)
+        self.writer = chooser.getWriter(fileName)
+
+    def setNamesFileName(self, fileName):  # text file holding one name per line
+        self.namesFileName = fileName
+
+    def _readNames(self):
+        # one name per line; blank lines are skipped so that they are never reported as missing
+        handle = open(self.namesFileName)
+        try:
+            self.names = [name for name in (line.strip() for line in handle) if name]
+        finally:
+            handle.close()
+
+    def _write(self):
+        """Filter the input with the name list; afterwards self.names holds only the names never met."""
+        wanted = frozenset(self.names)
+        missing = set(wanted)
+        nbElements = self.parser.getNbItems()
+        progress = Progress(nbElements, "Parsing input file", self.verbosity)
+        nbRead = 0
+        nbWritten = 0
+        for element in self.parser.getIterator():
+            name = element.getName()
+            nbRead += 1
+            if Utils.xor(name in wanted, self.exclude):  # wanted is never mutated: a repeated name still matches
+                self.writer.addElement(element)
+                nbWritten += 1
+            missing.discard(name)
+            progress.inc()
+        progress.done()
+        self.names = [name for name in self.names if name in missing]
+        if self.verbosity > 0:
+            print "%d read" % (nbRead)
+            print "%d written (%d%%)" % (nbWritten, 0 if nbRead == 0 else round(float(nbWritten) / nbRead * 100))
+
+    def run(self):  # read the names, filter the input, then report the names that were never met
+        self._readNames()
+        self._write()
+        if self.names:
+            print "Some names are not present in the file: %s" % ", ".join(self.names)
+
+if __name__ == "__main__":
+    description = "Restrict Sequence List v1.0.1: Keep the elements of a list of sequences whose name is mentionned in a given file. [Category: Data Selection]"
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input", dest="inputFile", action="store", type="string", help="input file [compulsory] [format: file in sequence format given by -f]")
+    parser.add_option("-f", "--format", dest="format", action="store", default="fasta", type="string", help="format of the input and output files [compulsory] [format: sequence file format] [default: fasta]")
+    parser.add_option("-n", "--name", dest="names", action="store", type="string", help="names of the transcripts [compulsory] [format: file in TXT format]")
+    parser.add_option("-o", "--output", dest="outputFile", action="store", type="string", help="output file [format: output file in sequence format given by -f]")
+    parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="output all those whose name is NOT on the list [format: boolean]")
+    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
+    (options, args) = parser.parse_args()
+
+    rsl = RestrictSequenceList(options.verbosity)
+    rsl.setInputFileName(options.inputFile, options.format)
+    rsl.setOutputFileName(options.outputFile, options.format)
+    rsl.setNamesFileName(options.names)
+    rsl.setExclusion(options.exclude)
+    rsl.run()
diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/restrictTranscriptList.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/smart_toolShed/SMART/Java/Python/restrictTranscriptList.py Thu Jan 17 10:52:14 2013 -0500
@@ -0,0 +1,85 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+"""Restrict a transcript list with some parameters (regions)"""
+
+from optparse import OptionParser
+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
+from commons.core.writer.TranscriptWriter import TranscriptWriter
+from SMART.Java.Python.misc.Progress import Progress
+
+STRAND2DIRECTION = {"+": 1, "-": -1, None: None}
+
+if __name__ == "__main__":
+
+    # parse command line
+    description = "Restrict Transcript List v1.0.2: Keep the coordinates which are located in a given position. [Category: Data Selection]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
+    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format [compulsory] [format: transcript file format]")
+    parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="chromosome [format: string]")
+    parser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="start [format: int]")
+    parser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="end [format: int]")
+    parser.add_option("-t", "--strand", dest="strand", action="store", default=None, type="string", help="strand (+ or -) [format: string]")
+    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
+    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
+    (options, args) = parser.parse_args()
+
+    if options.strand not in STRAND2DIRECTION:  # reject a bad strand with a usage error, not a KeyError
+        parser.error("strand should be '+' or '-', not '%s'" % (options.strand))
+    direction = STRAND2DIRECTION[options.strand]
+
+    parser = TranscriptContainer(options.inputFileName, options.format, options.verbosity)
+    writer = TranscriptWriter(options.outputFileName, options.format, options.verbosity)
+
+    nbTranscripts = parser.getNbTranscripts()
+    progress = Progress(nbTranscripts, "Parsing file %s" % (options.inputFileName), options.verbosity)
+
+    nbTotal = 0
+    nbKept = 0
+    for transcript in parser.getIterator():
+        progress.inc()
+        nbTotal += 1
+        if options.chromosome is not None and options.chromosome != transcript.getChromosome():
+            continue
+        if options.start is not None and options.start > transcript.getEnd():  # transcript ends before the requested start
+            continue
+        if options.end is not None and options.end < transcript.getStart():  # transcript starts after the requested end
+            continue
+        if direction is not None and direction != transcript.getDirection():
+            continue
+        nbKept += 1
+        writer.addTranscript(transcript)
+    progress.done()
+
+    writer.write()
+
+    print "%d out of %d are kept (%f%%)" % (nbKept, nbTotal, 0 if nbTotal == 0 else float(nbKept) / nbTotal * 100)
diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/runRandomJobs.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/smart_toolShed/SMART/Java/Python/runRandomJobs.py Thu Jan 17 10:52:14 2013 -0500
@@ -0,0 +1,56 @@
+import unittest
+import os
+import subprocess
+import time
+from optparse import OptionParser
+from SMART.Java.Python.ncList.test.MockFindOverlaps_randomExample import MockFindOverlaps_randomExample
+from SMART.Java.Python.FindOverlapsOptim import FindOverlapsOptim
+
+def sortGff3File(fileName, extraOptions = ()):
+    """Sort fileName in place with sort(1) options -f -n -k4 -k5.4rn followed by extraOptions; return sort's exit status."""
+    # an argument list (no shell) keeps file names holding spaces or shell metacharacters intact
+    return subprocess.call(["sort", "-f", "-n", "-k4", "-k5.4rn"] + list(extraOptions) + ["-o", fileName, fileName])
+
+if __name__ == '__main__':
+    description = "runRandomJobs: create random ref/query files (with size given), and run the jobs on cluster with help of runJobs.sh"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--inputRef", dest="inputRefGff3FileName", action="store", type="string", help="Reference input file [compulsory] [format: file in gff3 format]")
+    parser.add_option("-j", "--inputQuery", dest="inputQueryGff3FileName", action="store", type="string", help="Query input file [compulsory] [format: file in gff3 format]")
+    parser.add_option("-m", "--inputRefSize", dest="numberOfRefReads", action="store", type="int", help="The number of Reference")
+    parser.add_option("-n", "--inputQuerySize", dest="numberOfQReads", action="store", type="int", help="The number of Query")
+    parser.add_option("-o", "--output", dest="outputGff3FileName", action="store", type="string", help="output file [compulsory] [format: output file in gff3 format]")
+    (options, args) = parser.parse_args()
+    # every option is used unconditionally below, so fail early with a usage message instead of a TypeError
+    if None in (options.inputRefGff3FileName, options.inputQueryGff3FileName, options.numberOfRefReads, options.numberOfQReads, options.outputGff3FileName):
+        parser.error("options -i, -j, -m, -n and -o are all compulsory")
+
+    outputDataName = 'timeResult.dat'
+    chromSize = 100000
+    fTime = open(outputDataName, 'w')
+    try:
+        fTime.write('NbRef\tNbQuery\tNbOverlap\ttime\n')
+        print 'ref size = %d, query size = %d' %(options.numberOfRefReads, options.numberOfQReads)
+        iMFOR_ref = MockFindOverlaps_randomExample(options.inputRefGff3FileName, 'ref', options.numberOfRefReads, chromSize)
+        iMFOR_ref.write()
+        sortGff3File(options.inputRefGff3FileName)
+        iMFOR_query = MockFindOverlaps_randomExample(options.inputQueryGff3FileName,'q', options.numberOfQReads, chromSize)
+        iMFOR_query.write()
+        sortGff3File(options.inputQueryGff3FileName)
+        # the class imported above is FindOverlapsOptim; the name FindOverlaps_optim used here before was undefined
+        # NOTE(review): constructor arguments are kept as they were -- confirm against FindOverlapsOptim
+        iFOO = FindOverlapsOptim(options.inputRefGff3FileName, options.inputQueryGff3FileName)
+        iFOO.setOutputGff3FileName(options.outputGff3FileName)
+
+        startTime_optim = time.time()
+        iFOO.run()
+        iFOO.close()
+        nbOverlap = iFOO.getNbOverlap()
+        endTime_optim = time.time()
+        sortGff3File(options.outputGff3FileName, ["-k9.5", "-t", ";"])
+        totalTime_optim = endTime_optim - startTime_optim
+        print 'we take %s second.' % (totalTime_optim)
+        fTime.write('%d\t%d\t%d\t%.2f\n'%(options.numberOfRefReads, options.numberOfQReads, nbOverlap, totalTime_optim))
+        iFOO.deletIntermediateFiles()
+    finally:
+        fTime.close()
diff -r 000000000000 -r e0f8dcca02ed smart_toolShed/SMART/Java/Python/selectByNbOccurrences.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/smart_toolShed/SMART/Java/Python/selectByNbOccurrences.py Thu Jan 17 10:52:14 2013 -0500
@@ -0,0 +1,89 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+# +"""Select the transcript that have not more that a given number of occurrences""" + +import os +from optparse import OptionParser +from SMART.Java.Python.structure.TranscriptContainer import * +from commons.core.writer.Gff3Writer import * +from SMART.Java.Python.misc.Progress import * +from SMART.Java.Python.misc.RPlotter import * + + +if __name__ == "__main__": + + # parse command line + description = "Select by # of Occurrences v1.0.1: Keep the reads which have mapped less than a given number of times. [Category: Personnal]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input [compulsory] [format: transcript file format]") + parser.add_option("-n", "--occurrences", dest="occurrences", action="store", default=1, type="int", help="maximum number of occurrences allowed [format: int] [default: 1]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]") + parser.add_option("-y", "--mysql", dest="mysql", action="store_true", default=False, help="mySQL output [format: bool] [default: false]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]") + parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]") + (options, args) = parser.parse_args() + + parser = TranscriptContainer(options.inputFileName, options.format, options.verbosity) + + # get occurrences of the transcripts + names = dict() + progress = Progress(parser.getNbTranscripts(), "Reading names of %s" % (options.inputFileName), options.verbosity) + for transcript in 
parser.getIterator(): + name = transcript.name + if name not in names: + names[name] = 1 + else: + names[name] += 1 + progress.inc() + progress.done() + + # write output file + nbWritten = 0 + writer = Gff3Writer(options.outputFileName, options.verbosity) + if options.mysql: + mysqlWriter = MySqlTranscriptWriter(options.outputFileName, options.verbosity) + progress = Progress(parser.getNbTranscripts(), "Writing transcripts", options.verbosity) + for transcript in parser.getIterator(): + name = transcript.name + if names[name] <= options.occurrences: + nbWritten += 1 + writer.addTranscript(transcri