view SMART/Java/Python/Cpp/inputFileParser.cpp @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line source

#include <algorithm>
#include <set>
#include <vector>
#include <sstream>
#include "inputFileParser.hpp"

// Maximum number of intervals buffered in memory for one chromosome: once a
// chromosome's list reaches this size, addToList() flushes it to a temporary file.
static const unsigned int MAX_SIZE = 10000;

// Stores the input and output file names, and derives the prefix that
// getTmpName() uses to build temporary file names.
InputFileParser::InputFileParser(string inputFileName, string outputFileName):
    inputFileName(inputFileName),
    outputFileName(outputFileName)
{
    // The prefix is everything before the last '.'. When the name has no '.',
    // find_last_of() returns npos and substr() keeps the whole name.
    const string::size_type extensionStart = outputFileName.find_last_of('.');
    outputFilePrefix = outputFileName.substr(0, extensionStart);
}


void InputFileParser::parse() {
    ifstream file;
    string line;
    file.open(inputFileName.c_str());
    if (file.is_open()) {
        GenomicInterval genomicInterval;
        while (file.good()) {
            getline(file, line);
            if (line.size() > 0) {
                genomicInterval.parseFromLine(line);
                addToList(genomicInterval);
            }
        }
        syncFiles();
        file.close();
    }
    else {
        cout << "Unable to open file" << inputFileName;
    }
    merge();
}


void InputFileParser::addToList(GenomicInterval &genomicInterval) {
    Interval interval (genomicInterval);
    IntervalsType *intervals;
    SortedIntervalsTypes::iterator iter = sortedIntervals.find(genomicInterval.chromosome);
    if (iter == sortedIntervals.end()) {
        intervals = new IntervalsType;
        sortedIntervals[genomicInterval.chromosome] = intervals;
    }
    else {
        intervals = iter->second;
    }
    //cout << "pushing " << interval.start << "-" << interval.end << endl;
    intervals->push_back(&interval);
    if (intervals->size() >= MAX_SIZE) {
        writeTmpFile(genomicInterval.chromosome);
    }
}


void InputFileParser::writeTmpFile(string &chromosome) {
    SortedIntervalsTypes::iterator iter = sortedIntervals.find(chromosome);
    IntervalsType *intervals = iter->second;

    sort(intervals->begin(), intervals->end());
    string fileName = getTmpName(chromosome);
    ofstream file(fileName.c_str(), ios::out | ios::binary);
    for (unsigned i = 0; i < intervals->size(); i++) {
        cout << "writing " << (*intervals)[i]->start << "-" << (*intervals)[i]->end << endl;
        (*intervals)[i]->writeBinary(file);
    }
    file.close();
    ++counter[chromosome];
    
    sortedIntervals[chromosome] = NULL;
    delete intervals;
}


// Flushes every chromosome that still has buffered intervals to a temporary
// file.
void InputFileParser::syncFiles() {
    for (SortedIntervalsTypes::iterator iter = sortedIntervals.begin(); iter != sortedIntervals.end(); ++iter) {
        // writeTmpFile() leaves a NULL entry behind once a buffer has been
        // flushed; such a chromosome has nothing left to write.
        if (iter->second == NULL) {
            continue;
        }
        // writeTmpFile() takes a non-const reference, so the key is copied.
        string chromosome = iter->first;
        writeTmpFile(chromosome);
    }
}


// Builds the name of the i-th temporary file of a chromosome:
// <outputFilePrefix>_tmp_<chromosome>_<i>.tmp
string InputFileParser::getTmpName(const string &chromosome, unsigned int i) {
    stringstream s;
    // The prefix is written once: repeating it would point to a non-existent
    // directory whenever the prefix contains a path.
    s << outputFilePrefix << "_tmp_" << chromosome << "_" << i << ".tmp";
    return s.str();
}


// Builds the name of the temporary file identified by the current value of
// the chromosome's counter.
string InputFileParser::getTmpName(const string &chromosome) {
    const unsigned int currentIndex = counter[chromosome];
    return getTmpName(chromosome, currentIndex);
}


void InputFileParser::merge() {
    ofstream outputFile(outputFileName.c_str());
    for (SortedIntervalsTypes::iterator iter = sortedIntervals.begin(); iter != sortedIntervals.end(); iter++) {
        merge(iter->first, outputFile);
    }
}


void InputFileParser::merge(const string &chromosome, ofstream &outputFile) {
    ifstream *files = new ifstream[counter[chromosome]];
    set<NumberIntervalType *> intervals;
    for (unsigned int i = 0; i < counter[chromosome]; i++) {
        string fileName = getTmpName(chromosome, i);
        files[i].open(fileName.c_str());
    }
    for (unsigned int i = 0; i < counter[chromosome]; i++) {
        if (files[i].good()) {
            Interval interval;
            interval.parseBinary(files[i]);
            NumberIntervalType ni = NumberIntervalType(&interval, i);
            intervals.insert(&ni);
        }
    }
    while (! intervals.empty()) {
        NumberIntervalType *ni = *intervals.begin();
        GenomicInterval gi(chromosome, ni->first->start, ni->first->end);
        outputFile << gi;
        intervals.erase(intervals.begin());
        if (files[ni->second].good()) {
            Interval interval;
            interval.parseBinary(files[ni->second]);
            NumberIntervalType nni = NumberIntervalType(&interval, ni->second);
            intervals.insert(&nni);
        }
    }
    for (unsigned int i = 0; i < counter[chromosome]; i++) {
        files[i].close();
    }
    delete[] files;
}