comparison smart_toolShed/commons/core/utils/FileUtils.py @ 0:e0f8dcca02ed

Uploaded the S-MART tool, a toolbox for managing RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 # Copyright INRA (Institut National de la Recherche Agronomique)
2 # http://www.inra.fr
3 # http://urgi.versailles.inra.fr
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30
31
32 import os
33 import glob
34 import shutil
35 import sys
36 import re
37 import math
38 try:
39 import hashlib
40 except:
41 pass
42
43
## Collection of static helpers to count, compare, concatenate, clean up,
#  list and split files
#
# Every method is static: call them as FileUtils.method( ... ).
#
class FileUtils(object):

    ## Return the number of lines in the given file
    #
    # A last line made of a lone end-of-line symbol is not counted, hence a
    # file containing only "\n" is reported as having no line.
    #
    # @param fileName string name of the input file
    #
    # @return int number of lines
    #
    @staticmethod
    def getNbLinesInSingleFile(fileName):
        nbLines = 0
        lastLine = None
        # Stream the file rather than loading all its lines in memory.
        with open(fileName, "r") as fileHandler:
            for lastLine in fileHandler:
                nbLines += 1
        if lastLine == "\n":
            nbLines -= 1
        return nbLines

    ## Return the number of lines in the files in the given list
    #
    # @param lFileNames list of file names
    #
    # @return int sum of the line counts given by getNbLinesInSingleFile
    #
    @staticmethod
    def getNbLinesInFileList(lFileNames):
        return sum(FileUtils.getNbLinesInSingleFile(fileName) for fileName in lFileNames)

    ## Return True if the given file exists, False otherwise
    #
    # @param fileName string path to test (handed to os.path.exists)
    #
    @staticmethod
    def isRessourceExists(fileName):
        return os.path.exists(fileName)

    ## Return True if the given file is empty, False otherwise
    #
    # @note "empty" means that getNbLinesInSingleFile returns 0, thus a file
    #       holding a single end-of-line symbol is considered empty too
    #
    # @param fileName string name of the input file
    #
    @staticmethod
    def isEmpty(fileName):
        return 0 == FileUtils.getNbLinesInSingleFile(fileName)

    ## Return True if both files are identical, False otherwise
    #
    # The files are compared byte by byte inside the Python process: no shell
    # command is launched and no temporary file is written.
    #
    # @param file1 string name of the first file
    # @param file2 string name of the second file
    #
    # @return bool False as well (with a message on stderr) when one of the
    #         files cannot be read
    #
    @staticmethod
    def are2FilesIdentical(file1, file2):
        chunkSize = 65536
        try:
            with open(file1, "rb") as handler1:
                with open(file2, "rb") as handler2:
                    while True:
                        chunk1 = handler1.read(chunkSize)
                        chunk2 = handler2.read(chunkSize)
                        if chunk1 != chunk2:
                            return False
                        if not chunk1:
                            # Both files ended at the same time.
                            return True
        except (IOError, OSError) as error:
            msg = "ERROR: cannot compare '%s' and '%s': %s" % (file1, file2, error)
            sys.stderr.write("%s\n" % msg)
            sys.stderr.flush()
            return False

    ## Return a string with all the content of the files in the given list
    #
    # The files are read in the sorted order of their names. The list given
    # by the caller is left untouched.
    #
    # @param lFiles list of file names
    #
    # @return string concatenation of the content of the files
    #
    @staticmethod
    def getFileContent(lFiles):
        lContents = []
        for fileName in sorted(lFiles):
            with open(fileName, "r") as currentFile:
                lContents.append(currentFile.read())
        return "".join(lContents)

    ## Save content of the given file after having sorted it
    #
    # @param inFile string name of the input file
    # @param outFile string name of the output file (default: overwrite inFile)
    #
    @staticmethod
    def sortFileContent(inFile, outFile=""):
        with open(inFile, "r") as inFileHandler:
            lines = sorted(inFileHandler)
        if outFile == "":
            outFile = inFile
        with open(outFile, "w") as outFileHandler:
            outFileHandler.writelines(lines)

    ## Add end-of-line symbol to the given file content if necessary
    #
    # @param fileContent string
    #
    # @return string the content, ending with "\n" unless it is empty
    #
    @staticmethod
    def addNewLineAtTheEndOfFileContent(fileContent):
        if fileContent and not fileContent.endswith("\n"):
            fileContent += "\n"
        return fileContent

    ## Concatenate files in the given list
    #
    # The data are appended to the output file, which is created if needed.
    #
    # @param lFiles list of input file names (left untouched)
    # @param outFile string name of the output file
    # @param sort bool process the input files in sorted order
    # @param skipHeaders bool drop the first line of each input file
    # @param separator string written between two consecutive input files
    #
    @staticmethod
    def catFilesFromList(lFiles, outFile, sort=True, skipHeaders=False, separator=""):
        if sort:
            lFiles = sorted(lFiles)
        with open(outFile, "a") as outFileHandler:
            for index, singleFile in enumerate(lFiles):
                if index > 0:
                    outFileHandler.write(separator)
                with open(singleFile, "r") as singleFileHandler:
                    if skipHeaders:
                        singleFileHandler.readline()
                    shutil.copyfileobj(singleFileHandler, outFileHandler)

    ## Concatenate files according to the given pattern
    #
    # @param pattern string glob pattern selecting the input files
    # @param outFile string name of the output file
    # @param skipHeaders bool drop the first line of each input file
    # @param separator string written between two consecutive input files
    #
    @staticmethod
    def catFilesByPattern(pattern, outFile, skipHeaders=False, separator=""):
        lFiles = glob.glob(pattern)
        FileUtils.catFilesFromList(lFiles, outFile, skipHeaders=skipHeaders, separator=separator)

    ## Remove files listed according to the given pattern
    #
    # @example prefix="/home/tmp/dummy*.txt"
    #
    # @param prefix string glob pattern
    #
    @staticmethod
    def removeFilesByPattern(prefix):
        for fileName in glob.glob(prefix):
            os.remove(fileName)

    ## Remove files listed according to the suffixes in the given list
    #
    # @param targetPath string directory in which the files are searched
    # @param lSuffixes list of suffixes, each one is appended to "*" to
    #        build a glob pattern
    #
    @staticmethod
    def removeFilesBySuffixList(targetPath, lSuffixes):
        for suffix in lSuffixes:
            # os.path.join copes with a trailing "/" and with an empty path.
            pattern = os.path.join(targetPath, "*%s" % suffix)
            FileUtils.removeFilesByPattern(pattern)

    ## Stream a file through a line filter and save the result
    #
    # The result is first written in "<outFile>.tmp", then moved onto
    # outFile, so that inFile and outFile can be the same file. The lines are
    # handled as bytes.
    #
    # @param inFile string name of the input file
    # @param outFile string name of the output file
    # @param lineFilter function taking a line (bytes) and returning bytes
    #
    @staticmethod
    def _rewriteLines(inFile, outFile, lineFilter):
        tmpFile = "%s.tmp" % outFile
        try:
            with open(inFile, "rb") as inFileHandler:
                with open(tmpFile, "wb") as tmpFileHandler:
                    for line in inFileHandler:
                        tmpFileHandler.write(lineFilter(line))
            shutil.move(tmpFile, outFile)
        finally:
            # Still there only if something failed before the move.
            if os.path.exists(tmpFile):
                os.remove(tmpFile)

    ## Remove repeated blanks in the given file
    #
    # Each run of space characters is replaced by a single space.
    #
    # @param inFile string name of the input file
    # @param outFile string name of the output file (default: overwrite inFile)
    #
    @staticmethod
    def removeRepeatedBlanks(inFile, outFile=""):
        if outFile == "":
            outFile = inFile
        squeezeBlanks = re.compile(b" +").sub
        FileUtils._rewriteLines(inFile, outFile, lambda line: squeezeBlanks(b" ", line))

    ## Remove files in the given list
    #
    # @param lFiles list of file names
    #
    @staticmethod
    def removeFilesFromList(lFiles):
        for fileName in lFiles:
            os.remove(fileName)

    ## Remove files in the given list if exist
    #
    # @param lFiles list of file names, the missing ones are skipped
    #
    @staticmethod
    def removeFilesFromListIfExist(lFiles):
        for fileName in lFiles:
            if FileUtils.isRessourceExists(fileName):
                os.remove(fileName)

    ## Append the content of a file to another file
    #
    # @param inFile string name of the input file
    # @param outFile string name of the output file
    #
    @staticmethod
    def appendFileContent(inFile, outFile):
        with open(outFile, "a") as outFileHandler:
            with open(inFile, "r") as inFileHandler:
                shutil.copyfileobj(inFileHandler, outFileHandler)

    ## Replace Windows end-of-line by Unix end-of-line
    #
    # The file is rewritten in place; it is replaced only once the converted
    # data have been completely written.
    #
    # @param inFile string name of the file to convert
    #
    @staticmethod
    def fromWindowsToUnixEof(inFile):
        FileUtils._rewriteLines(inFile, inFile, lambda line: line.replace(b"\r\n", b"\n"))

    ## Remove duplicated lines in a file
    #
    # @note it preserves the initial order and handles blank lines
    #
    # @param inFile string name of the file, rewritten in place; every
    #        written line ends with "\n"
    #
    @staticmethod
    def removeDuplicatedLines(inFile):
        with open(inFile, "r") as inFileHandler:
            lLines = inFileHandler.read().split("\n")
        # The split gives a last empty item when the file ends with "\n".
        if lLines[-1] == "":
            del lLines[-1]
        sSeenLines = set()
        lUniqueLines = []
        for line in lLines:
            if line not in sSeenLines:
                sSeenLines.add(line)
                lUniqueLines.append(line)
        with open(inFile, "w") as inFileHandler:
            inFileHandler.writelines("%s\n" % line for line in lUniqueLines)

    ## Write a list of lines in a given file
    #
    # @param inFile string name of the file to write (overwritten)
    # @param lLines list of strings, written as they are (no end-of-line
    #        symbol is added)
    #
    @staticmethod
    def writeLineListInFile(inFile, lLines):
        with open(inFile, "w") as inFileHandler:
            inFileHandler.writelines(lLines)

    ## Give the list of absolute path of each directory in the given directory
    #
    # @param rootPath string absolute path of the given directory
    #
    # @return lDirPath list of absolute directory path
    #
    @staticmethod
    def getAbsoluteDirectoryPathList(rootPath):
        return [ressource for ressource in glob.glob(rootPath + "/*") if os.path.isdir(ressource)]

    ## Get a sublist of which each element matches/doesn't match a pattern
    #
    # The pattern is a regular expression searched anywhere in the path.
    #
    # @param lPath string list of paths
    #
    # @param pattern string pattern
    #
    # @param match bool keep the matching paths if True, the other ones if False
    #
    # @return lPathMatching list of path matching pattern
    #
    @staticmethod
    def getSubListAccordingToPattern(lPath, pattern, match=True):
        # re.search keeps alternations such as "a|b" in one piece.
        regex = re.compile(pattern)
        keepMatching = bool(match)
        return [path for path in lPath if (regex.search(path) is not None) == keepMatching]

    ## Give the list of file names found in the given directory
    #
    # @param dirPath string absolute path of the given directory
    # @param patternFileFilter string regular expression that the beginning
    #        of the file name has to match
    #
    # @return lFilesInDir list of file names
    #
    @staticmethod
    def getFileNamesList(dirPath, patternFileFilter=".*"):
        regex = re.compile(patternFileFilter)
        lFilesInDir = []
        for ressource in glob.glob(dirPath + "/*"):
            if os.path.isfile(ressource):
                fileName = os.path.basename(ressource)
                if regex.match(fileName):
                    lFilesInDir.append(fileName)
        return lFilesInDir

    ## Return the MD5 sum of a file
    #
    # @param inFile string name of the input file
    #
    # @return string hexadecimal digest, or "" if hashlib is not loaded
    #
    @staticmethod
    def getMd5SecureHash(inFile):
        if "hashlib" not in sys.modules:
            return ""
        md5 = hashlib.md5()
        # The digest is computed on bytes, hence the binary mode.
        with open(inFile, "rb") as inFileHandler:
            for chunk in iter(lambda: inFileHandler.read(65536), b""):
                md5.update(chunk)
        return md5.hexdigest()

    ## Cat all files of a given directory
    #
    # @param dir string directory name
    # @param outFileName string output file name
    #
    @staticmethod
    def catFilesOfDir(dir, outFileName):
        lFilePaths = [dir + "/" + fileName for fileName in FileUtils.getFileNamesList(dir)]
        FileUtils.catFilesFromList(lFilePaths, outFileName)

    ## Return True if size file > 0 octet
    #
    # @param fileName string file name
    #
    @staticmethod
    def isSizeNotNull(fileName):
        return os.path.getsize(fileName) > 0

    ## Split one file into N Files by lines
    #
    # The output files are written in the current directory and named
    # "<prefix>-<i><extension>" with i starting at 1. N is lowered to the
    # number of lines given by getNbLinesInSingleFile when it is greater; a
    # single file is written when N is 0 or negative.
    #
    # @param fileName string file name
    # @param N int number of files to create
    #
    @staticmethod
    def splitFileIntoNFiles(fileName, N):
        nbLines = FileUtils.getNbLinesInSingleFile(fileName)
        nbLinesInEachFile = nbLines
        if N > nbLines:
            N = nbLines
        if N > 0:
            nbLinesInEachFile = int(math.ceil(float(nbLines) / N))
        else:
            N = 1
        filePrefix, fileExt = os.path.splitext(os.path.basename(fileName))
        with open(fileName, "r") as fileHandler:
            for i in range(1, N + 1):
                with open("%s-%s%s" % (filePrefix, i, fileExt), "w") as f:
                    for _ in range(nbLinesInEachFile):
                        # readline() returns "" once the input is exhausted.
                        f.write(fileHandler.readline())

    ## Split one file into files of N lines
    #
    # The output files are written in the current directory and named
    # "<prefix>-<i><extension>" with i starting at 1. The input file is
    # copied as a whole in the first output file when it is empty or when N
    # is 0 or negative.
    #
    # @param fileName string input file name
    # @param N int lines number per files
    #
    @staticmethod
    def splitFileAccordingToLineNumber(fileName, N):
        filePrefix, fileExt = os.path.splitext(os.path.basename(fileName))
        with open(fileName) as inF:
            fileNb = 1
            line = inF.readline()
            if not line or N <= 0:
                outFileName = "%s-%s%s" % (filePrefix, fileNb, fileExt)
                with open(fileName, "rb") as srcHandler:
                    with open(outFileName, "wb") as dstHandler:
                        shutil.copyfileobj(srcHandler, dstHandler)
            else:
                while line:
                    outFileName = "%s-%s%s" % (filePrefix, fileNb, fileExt)
                    with open(outFileName, "w") as outF:
                        lineNb = 1
                        while lineNb <= N and line:
                            outF.write(line)
                            line = inF.readline()
                            lineNb += 1
                    fileNb += 1