| 18 | 1 #include <algorithm> | 
|  | 2 #include <set> | 
|  | 3 #include <vector> | 
|  | 4 #include <sstream> | 
|  | 5 #include "inputFileParser.hpp" | 
|  | 6 | 
// Number of intervals buffered for a single chromosome before addToList()
// flushes that buffer to a temporary file.
static const unsigned int MAX_SIZE = 10000U;
|  | 8 | 
|  | 9 InputFileParser::InputFileParser(string inputFileName, string outputFileName): inputFileName(inputFileName), outputFileName(outputFileName) { | 
|  | 10     outputFilePrefix = outputFileName.substr(0, outputFileName.find_last_of('.')); | 
|  | 11 } | 
|  | 12 | 
|  | 13 | 
|  | 14 void InputFileParser::parse() { | 
|  | 15     ifstream file; | 
|  | 16     string line; | 
|  | 17     file.open(inputFileName.c_str()); | 
|  | 18     if (file.is_open()) { | 
|  | 19         GenomicInterval genomicInterval; | 
|  | 20         while (file.good()) { | 
|  | 21             getline(file, line); | 
|  | 22             if (line.size() > 0) { | 
|  | 23                 genomicInterval.parseFromLine(line); | 
|  | 24                 addToList(genomicInterval); | 
|  | 25             } | 
|  | 26         } | 
|  | 27         syncFiles(); | 
|  | 28         file.close(); | 
|  | 29     } | 
|  | 30     else { | 
|  | 31         cout << "Unable to open file" << inputFileName; | 
|  | 32     } | 
|  | 33     merge(); | 
|  | 34 } | 
|  | 35 | 
|  | 36 | 
|  | 37 void InputFileParser::addToList(GenomicInterval &genomicInterval) { | 
|  | 38     Interval interval (genomicInterval); | 
|  | 39     IntervalsType *intervals; | 
|  | 40     SortedIntervalsTypes::iterator iter = sortedIntervals.find(genomicInterval.chromosome); | 
|  | 41     if (iter == sortedIntervals.end()) { | 
|  | 42         intervals = new IntervalsType; | 
|  | 43         sortedIntervals[genomicInterval.chromosome] = intervals; | 
|  | 44     } | 
|  | 45     else { | 
|  | 46         intervals = iter->second; | 
|  | 47     } | 
|  | 48     //cout << "pushing " << interval.start << "-" << interval.end << endl; | 
|  | 49     intervals->push_back(&interval); | 
|  | 50     if (intervals->size() >= MAX_SIZE) { | 
|  | 51         writeTmpFile(genomicInterval.chromosome); | 
|  | 52     } | 
|  | 53 } | 
|  | 54 | 
|  | 55 | 
|  | 56 void InputFileParser::writeTmpFile(string &chromosome) { | 
|  | 57     SortedIntervalsTypes::iterator iter = sortedIntervals.find(chromosome); | 
|  | 58     IntervalsType *intervals = iter->second; | 
|  | 59 | 
|  | 60     sort(intervals->begin(), intervals->end()); | 
|  | 61     string fileName = getTmpName(chromosome); | 
|  | 62     ofstream file(fileName.c_str(), ios::out | ios::binary); | 
|  | 63     for (unsigned i = 0; i < intervals->size(); i++) { | 
|  | 64         cout << "writing " << (*intervals)[i]->start << "-" << (*intervals)[i]->end << endl; | 
|  | 65         (*intervals)[i]->writeBinary(file); | 
|  | 66     } | 
|  | 67     file.close(); | 
|  | 68     ++counter[chromosome]; | 
|  | 69 | 
|  | 70     sortedIntervals[chromosome] = NULL; | 
|  | 71     delete intervals; | 
|  | 72 } | 
|  | 73 | 
|  | 74 | 
|  | 75 void InputFileParser::syncFiles() { | 
|  | 76     for (SortedIntervalsTypes::iterator iter = sortedIntervals.begin(); iter != sortedIntervals.end(); iter++) { | 
|  | 77         string chromosome = iter->first; | 
|  | 78         writeTmpFile(chromosome); | 
|  | 79     } | 
|  | 80 } | 
|  | 81 | 
|  | 82 | 
|  | 83 string InputFileParser::getTmpName(const string &chromosome, unsigned int i) { | 
|  | 84     stringstream s; | 
|  | 85     s << outputFilePrefix << outputFilePrefix << "_tmp_" << chromosome << "_" << i << ".tmp"; | 
|  | 86     return s.str(); | 
|  | 87 } | 
|  | 88 | 
|  | 89 | 
|  | 90 string InputFileParser::getTmpName(const string &chromosome) { | 
|  | 91     return getTmpName(chromosome, counter[chromosome]); | 
|  | 92 } | 
|  | 93 | 
|  | 94 | 
|  | 95 void InputFileParser::merge() { | 
|  | 96     ofstream outputFile(outputFileName.c_str()); | 
|  | 97     for (SortedIntervalsTypes::iterator iter = sortedIntervals.begin(); iter != sortedIntervals.end(); iter++) { | 
|  | 98         merge(iter->first, outputFile); | 
|  | 99     } | 
|  | 100 } | 
|  | 101 | 
|  | 102 | 
|  | 103 void InputFileParser::merge(const string &chromosome, ofstream &outputFile) { | 
|  | 104     ifstream *files = new ifstream[counter[chromosome]]; | 
|  | 105     set<NumberIntervalType *> intervals; | 
|  | 106     for (unsigned int i = 0; i < counter[chromosome]; i++) { | 
|  | 107         string fileName = getTmpName(chromosome, i); | 
|  | 108         files[i].open(fileName.c_str()); | 
|  | 109     } | 
|  | 110     for (unsigned int i = 0; i < counter[chromosome]; i++) { | 
|  | 111         if (files[i].good()) { | 
|  | 112             Interval interval; | 
|  | 113             interval.parseBinary(files[i]); | 
|  | 114             NumberIntervalType ni = NumberIntervalType(&interval, i); | 
|  | 115             intervals.insert(&ni); | 
|  | 116         } | 
|  | 117     } | 
|  | 118     while (! intervals.empty()) { | 
|  | 119         NumberIntervalType *ni = *intervals.begin(); | 
|  | 120         GenomicInterval gi(chromosome, ni->first->start, ni->first->end); | 
|  | 121         outputFile << gi; | 
|  | 122         intervals.erase(intervals.begin()); | 
|  | 123         if (files[ni->second].good()) { | 
|  | 124             Interval interval; | 
|  | 125             interval.parseBinary(files[ni->second]); | 
|  | 126             NumberIntervalType nni = NumberIntervalType(&interval, ni->second); | 
|  | 127             intervals.insert(&nni); | 
|  | 128         } | 
|  | 129     } | 
|  | 130     for (unsigned int i = 0; i < counter[chromosome]; i++) { | 
|  | 131         files[i].close(); | 
|  | 132     } | 
|  | 133     delete[] files; | 
|  | 134 } |