mkdir build |
cmake ../ |
make |
sudo make install |
#include <geco.h> using namespace geco;
//Let's instantiate a sequence retriever gLocal2bitSequenceRetriever seq_retriever("/adat/database/hg18/hg18.2bit",true); //And a Db Connection (actual values for the parameters have been hidden) gMySqlDb UCSCDb(dbname,hostname,dbuser,dbpass); //gMySqlDb UCSCDb("hg18","genome-mysql.cse.ucsc.edu","genome","");
gTranscript aTranscript=getUCSCRefseq("NM_016404",UCSCDb,seq_retriever,gRef,200,200);
// we can for example obtain the genomic length of the transcript cout << left << setw(48) << "Element length (including offsets): "; cout << aTranscript.getLength() << endl; //the element interval is returned as a 2 values gArray gElementInterval boundaries=aTranscript.getPremRNAInterval(); cout << left << setw(48) << "Transcript in element coordinates: "; cout << boundaries.getStart() << "-" << boundaries.getEnd() << endl; //we can convert it to reference coordiantes gReferenceInterval refBoundaries=aTranscript.getReferenceIntervalFromElement(boundaries); //getReferenceName returns the name of the reference used to build the object cout << left << setw(48) << "Transcript in reference coordinates: "; cout << aTranscript.getReferenceName() << ":" << refBoundaries.getStart() << "-" << refBoundaries.getEnd() << endl;
//The strand relative to the reference cout << left << setw(48) << "Strand relative to reference,(false = reverse):"; cout << aTranscript.isForward() << endl; //The number of exons cout << left << setw(48) << "Exon count:"; cout << aTranscript.getExonCount() << endl; //The length of the mRNA cout << left << setw(48) << "mRNA length:"; cout << aTranscript.getPremRNALength() << endl; //The interval of one exon boundaries=aTranscript.getExonInterval(1); cout << left << setw(48) << "Second exon element boundaries:"; cout << boundaries.getStart() << "-" << boundaries.getEnd() << endl; //Its reference coordinates refBoundaries=aTranscript.getReferenceIntervalFromElement(boundaries); cout << left << setw(48) << "Second exon reference boundaries:"; cout << aTranscript.getReferenceName() << ":" << refBoundaries.getStart() << "-" << refBoundaries.getEnd() << endl; //Its phase gArray<gShortUnsigned> phases=aTranscript.getExonPhase(1); cout << left << setw(48) << "Second exon phases:"; cout << phases[0] << "-" << phases[1] << endl; //A new gElement object representing the exon gElement anExon=aTranscript.getElement(aTranscript.getExonInterval(1)); //From which you can, for example, obtain the sequence cout << left << setw(48) << "Second exon sequence:"; cout << anExon.getSequence() << endl << endl;
// Create gVariations object using the transcript reference gVariations variations(aTranscript);
//Add some variations to the object using different set of coordinates variations.addInsertion(aTranscript.getReferenceName(),63841709,gSequence("TTTTTTTTTT")); variations.addDeletion(aTranscript,gElementInterval(400,493)); variations.addSubstitution(aTranscript.getElement(aTranscript.getExonInterval(2)),gElementInterval(60,61),gSequence("T"));
//The matrix raw data for SF2/ASF static const gScore SF2ASF_data[28]={ -1.139669, 0.621677,-1.584962, 1.323616,-1.584962,-1.584962, 0.621677, 1.365525,-1.103778, 0.725053, 0.330068, 0.938765,-1.584962,-1.584962, -0.211782, 0.174455, 0.478823,-1.584962, 0.334650, 0.992775,-0.105105, -1.584962,-0.500847,-1.584962,-1.127254,-1.584962,-1.127254, 0.268437 }; //A gMatrix object representing the PWM for SF2/ASF static const gMatrix<gScore> SF2ASF(4,7,SF2ASF_data); //the threshold to identify putative elements gScore SF2ASF_threshold=1.956; //Retriever instantiation gFeatureRetriever<gScore> sf2retriever(PWMScoreRetriever(SF2ASF,SF2ASF_threshold)); //Now we can add the feature aTranscript.addFeature<gScore>("SF2/ASF",sf2retriever,0);
gTranscript vTranscript(aTranscript,variations);
cout << "Varied Transcript Results" << endl; cout << "---------------------------------------------------------------------------------------------" << endl; boundaries=vTranscript.getPremRNAInterval(); refBoundaries=vTranscript.getReferenceIntervalFromElement(boundaries); cout << left << setw(48) << "Element length (including offsets): "; cout << vTranscript.getLength() << endl; cout << left << setw(48) << "Transcript in element coordinates: "; cout << boundaries.getStart() << "-" << boundaries.getEnd() << endl; cout << left << setw(48) << "Transcript in reference coordinates: "; cout << vTranscript.getReferenceName() << ":" << refBoundaries.getStart() << "-" << refBoundaries.getEnd() << endl; cout << left << setw(48) << "Strand relative to reference,(false = reverse):"; cout << vTranscript.isForward() << endl; cout << left << setw(48) << "Exon count:"; cout << vTranscript.getExonCount() << endl; cout << left << setw(48) << "mRNA length:"; cout << vTranscript.getmRNALength() << endl; boundaries=vTranscript.getExonInterval(1); refBoundaries=vTranscript.getReferenceIntervalFromElement(boundaries); cout << left << setw(48) << "Second exon element boundaries:"; cout << boundaries.getStart() << "-" << boundaries.getEnd() << endl; cout << left << setw(48) << "Second exon reference boundaries:"; cout << vTranscript.getReferenceName() << ":" << refBoundaries.getStart() << "-" << refBoundaries.getEnd() << endl; phases=vTranscript.getExonPhase(1); cout << left << setw(48) << "Second exon phases:"; cout << phases[0] << "-" << phases[1] << endl; //Here we do not instantiate a new gElement but obtain the sequence directly cout << left << setw(48) << "Second exon sequence:"; cout << vTranscript.getSequence(vTranscript.getExonInterval(1)) << endl << endl;
//We get the sequence corresponding to the third exon in both the transcripts: gSequence e3s=aTranscript.getSequence(aTranscript.getExonInterval(2)); gSequence ve3s=vTranscript.getSequence(vTranscript.getExonInterval(2)); //then we construct string of the same length with an asterisk where the sequences are not equal. gString mark(e3s.getLength(),'|'); gArray<gPos> diffpos=which(e3s!=ve3s); mark.setValues(diffpos,'*'); cout << "Third exon sequence comparison" << endl; cout << "---------------------------------------------------------------------------------------------" << endl; cout << "Wild-Type:\t" << e3s << endl; cout << " \t" << mark << endl; cout << "Varied: \t" << ve3s << endl;
cout << "Third exon Translated sequences comparison" << endl; cout << "---------------------------------------------------------------------------------------------" << endl; //we map the third exon interval onto the Protein sequence gElementInterval mintv=aTranscript.mapElementIntervalOnProtein(aTranscript.getExonInterval(2)); //get the corresponding protein sequence cout << "Wild-Type:\t" << aTranscript.getProteinSequence(mintv) << endl; //then we do the same for the varied transcript mintv=vTranscript.mapElementIntervalOnProtein(vTranscript.getExonInterval(2)); cout << "Varied: \t" << vTranscript.getProteinSequence(mintv) << endl;
//We get the element representing the 3rd exon gElement wtExon3=aTranscript.getElement(aTranscript.getExonInterval(2)); //then we retrieve the feature values for this element gArray<gScore> wtScores=wtExon3.getFeature<gScore>(0,gElementInterval(0,wtExon3.getLength())); //Finally we search for non null values gArray<gPos> wtHits=which(wtScores>(gScore)0); cout << "\"Wild-type\" third exon SF2/ASF elements:\nPosition\tScore" << endl; cout << "------------------------" << endl; for(gSize i=0;i<wtHits.getSize();i++){ cout << wtHits[i] << "\t\t" << wtScores[wtHits[i]] << endl; } cout << endl << endl;
gElement vExon3=vTranscript.getElement(vTranscript.getExonInterval(2)); gArray<gScore> vScores=vExon3.getFeature<gScore>(0,gElementInterval(0,vExon3.getLength())); gArray<gPos> vHits=which(vScores>(gScore)0); //Output the results cout << "\"Mutated\" third exon SF2/ASF genomic positions:\nPosition\tScore" << endl; cout << "------------------------" << endl; for(gSize i=0;i<vHits.getSize();i++){ cout << vHits[i] << "\t\t" << vScores[vHits[i]] << endl; } cout << endl << endl; //Check if they are equal if(vScores.equals(wtScores)){ cout << "vScores and wtScores are the same!" << endl << endl; }else{ cout << "vScores and wtScores differ!" << endl << endl; }
gElement wtESE(wtExon3,wtHits[5],wtHits[5]+7); gElement vESE(vExon3,wtHits[5],wtHits[5]+7); cout << "The wt binding site: \t" << wtESE.getSequence() << endl; cout << "The varied binding site:\t" << vESE.getSequence() << endl; refBoundaries=wtESE.getReferenceInterval(); cout << "ESE Reference coordinates:\t"; cout << wtESE.getReferenceName() << ":" << refBoundaries.getStart() << "-" << refBoundaries.getEnd() << endl;
cout << "Array size: " << AnArray.getSize() << endl; cout << "Number of NAs in the array: " << AnArray.NACount() << endl; cout << "Array value at position 0: " << AnArray[0] << endl; cout << "NA status at position 0:" << AnArray.isNA(0) << endl; cout << "Array value at position 8: " << AnArray[8] << endl; cout << "NA status at position 8:" << AnArray.isNA(8) << endl;
template <class T> ostream & operator << (ostream & out,const gArray<T> & array) { size_t lfield; if(typeid(T) == typeid(gScore) ){ lfield=7; cout.precision(1); cout << fixed; }else{ lfield=5; } string sep(4+lfield*array.getSize(),'-'); out << "Pos:"; for (gPos i=0;i<array.getSize();i++) { out << setw(lfield) << i; } out << endl << sep.c_str() << endl; out << "Val:"; for (gPos i=0;i<array.getSize();i++) { if(array.isNA(i)) out << setw(lfield) << "*"; else out << setw(lfield) << array[i]; } out << endl; return out; }
cout << "Elements of the array:\n" << AnArray << endl; cout << "Positions of the elements to retrieve:\n" << AnArrayOfPositions << endl; cout << "Array values at the given positions:\n" << AnArray[AnArrayOfPositions] << endl; cout << "An Array of characters:\n" << AnArrayOfCharacters << endl; cout << "Array values indexed by an array of characters:\n" << AnArray[AnArrayOfCharacters] << endl;
gArray<gShortUnsigned> A(15);
gArray<gShortUnsigned> A(45,10,false); cout << "Array directly instantiated with a given length (10)" << endl; cout << "and initialized to a default value (45) and NA status (false):\n"<< A << endl; gArray<gShortUnsigned> B(45,10,false,false); cout << "Array directly instantiated with a given length (10)" << endl; cout << "without initialization:\n"<< B << endl;
//raw data initialization gShortUnsigned raw_data[10]; for (int i=0;i<10;i++) raw_data[i]=rand() % 1000 + 1;
//Using raw (random in this case) data to initialize an array gArray<gShortUnsigned> A(raw_data,10); cout << "An array initialized with 10 random values:\n" << A << endl;
gPos raw_positions[3]={0,3,7}; gArray<gShortUnsigned> A(raw_data,10,gArray<gPos>(raw_positions,3)); cout << "An array initialized with 10 random values and 3 NA positions:\n" << A << endl; gArray<gShortUnsigned> B(raw_data,10,5); cout << "An array initialized with 10 random values and 1 NA position:\n" << B << endl;
gArray<gShortUnsigned> A(raw_data,10,5); cout << "The array to copy from:\n" << A << endl; //Simple copy gArray<gShortUnsigned> B(A); cout << "A copied array:\n" << B << endl; //Range copy gArray<gShortUnsigned> C(A,3,8); cout << "A range copied array:\n" << C << endl;
gArray<gShortUnsigned> A; cout << "The size of an empty array:\t" << A.getSize() << endl;
gArray<gShortUnsigned> A; A.setValue(0,15); cout << "After assigning a value at position 0:\n" << A << endl; A.setValue(4,25,false); cout << "After assigning a value at position specifying NA status:\n" << A << endl; A.setValue(9,35,true,3,false); cout << "After assigning a value at position 9 specifying fillvalue and fillNA parameters:\n"<< A << endl;
gArray<gShortUnsigned> B; B.setValue(9,A,5,4,false); cout << "Source array:\n" << A << endl; cout << "Destination Array:\n" << B << endl;
gArray<gShortUnsigned> A; A.setData(10,raw_data,3); cout << "After setting values using setData:\n" << A << endl;
// Positions array initialization gArray<gPos> positions; positions.setValue(0,0); positions.setValue(1,2,true); positions.setValue(2,4); positions.setValue(3,6); positions.setValue(4,8); positions.setValue(5,10); cout << "The positions:\n" << positions << endl; // Values array initialization gArray<gShortUnsigned> srcValues; srcValues.setValue(0,10); srcValues.setValue(1,20); srcValues.setValue(2,30,true); cout << "The values:\n" << srcValues << endl; // An array is instantiated with seven values initialized to the value 5 gArray<gShortUnsigned> dstValues(5,7); cout << "The array before copying:\n" << dstValues << endl; dstValues.setValues(positions,srcValues,0,false); cout << "The array after copying:\n" << dstValues << endl;
gArray<gShortUnsigned> A,B; A=57; B=A;
//Let's instantiate an empty array gArray<gShortUnsigned> A; //and an array of values (it's woth noting the implicit use of a single value constructor to specify the NApositions parameter gArray<gShortUnsigned> B(raw_data,5,4); cout << "The values of B:\n" << B << endl; A+=B; cout << "A+=B is empty if A is empty:\n" << A << endl; //now let's set some values of A A.setValue(9,5,false,5,false); cout << "The initial values of A:\n" << A << endl; // and apply the same operator as before A+=B; cout << "Values of A after A+=B:\n" << A << endl; //The Array to be assigned can also be specified via an implicit single value constructor A/=2; cout << "Values of A after A/=2:\n" << A << endl;
gArray<gShortUnsigned> A(raw_data,5); gArray<gShortUnsigned> B(raw_data+5,5,3); gArray<gShortUnsigned> C=A; cout << "The array to be modified:\n" << A << endl; cout << "The array used for modifications:\n" << B << endl; //Array concatenation A=C; A.concatenate(B); cout << "A concatenated with B:\n" << A << endl; //Array insertion A=C; A.insert(3,B); cout << "A with B inserted at position 3:\n" << A << endl; //Array values replacement A=C; A.replace(2,B,1,3); cout << "The values of A replaced starting from position 2 with the values" << endl; cout << "of B between 1 and 3(1 based):\n" << A << endl; //Array values removal A=C; A.remove(1,2); cout << "After removing two values starting from position 1:\n" << A << endl; //Array values reversion A=C; A.revert(1,4); cout << "A after reverting elements in the range [1,4):\n" << A << endl; //Array sorting A=C; gArray<gPos> D=A.sort(true); cout << "A after sorting:\n" << A << endl; cout << "the original positions of the sorted elements:\n" << D << endl;
gArray<gShortUnsigned> A(raw_data,10,4); A.setValue(8,384,false); cout << "An array A:\n" << A << endl; cout << "Positions where the value 384 occurs:\n" << A.find(384) << endl; cout << "Positions where the value 794 occurs:\n" << A.find(794) << endl;
//We are using the same A array instantiated in the previous example cout << "Counting occurrences of 384 in A:\n" << A.getCounts(384) << endl; //With the following we counts occurrences of the specified value in 4 //elements windows with offset set to 0. cout << "Counting occurrences of 384 in windows of length 4:\n" << A.getCounts(384,4,0) << endl; //Now we do the same but specifying a subset and skipnan set to false cout << "Counting occurrences of 384 in windows of length 4" << endl; cout << "in the subset [0,6) and with skipnan==false:\n" << A.getCounts(384,4,0,0,6,false) << endl; //Let's instantiate a result array gArray<gSize> counts(0,A.getSize(),true); A.getCounts(counts,384,4,2); cout << "Counting occurrences of 384 in windows of length 5 (passing the result array):\n" << counts << endl;
gArray<gShortUnsigned> B(0,2,false); B.setValue(0,778); B.setValue(1,384); cout << "Counting occurrences of 778 and 384:\n" << A.getCounts(B) << endl; cout << "Counting occurrences of 778 or 384:\n" << A.getCounts(B,true) << endl; gArray<gShortUnsigned> C=A.getUnique(); cout << "Non NA values in A:\n" << C << endl; cout << "Counting occurrences of values in A:\n" << A.getCounts(C) << endl;
gArray<gShortUnsigned> A(raw_data,5); cout << "The array A:\n" << A << endl; //Increment of array elements gArray<gShortUnsigned> B=A+1; cout << "B=A+1:\n" << B << endl; //Division gArray<gShortUnsigned> C(raw_data+5,5); cout << "The array C:\n" << C << endl; gArray<gShortUnsigned> D=B/C; cout << "D=B/C:\n" << D << endl; //that is equivalent to (in a single expression) cout << "(A+1)/C:\n" << (A+1)/C << endl;
gArray<gShortUnsigned> A(raw_data,5); cout << "The array A:\n" << A << endl; gArray<gShortUnsigned> B=A>=300; cout << "B = A>=300:\n" << B << endl; gArray<gShortUnsigned> C=A<400; cout << "C = A<400:\n" << C << endl; gArray<gShortUnsigned> D= B && C; cout << "D = B && C:\n" << D << endl; gArray<gPos> result=which(!D); cout << "result = which(!D):\n" << result << endl; //that is equivalent to (in a single expression) cout << "which( !( (A>=300) && (A<400) ) ):\n" << which( !( (A>=300) && (A<400) ) ) << endl;
gArray<_short> A((_short *) raw_data,10,4); A.setValue(5,-100,false); cout << "The Array A:\n" << A << endl; gArray<_short> B=apply((gArray<gScore>)A,&log10)*10; cout << "B containing A elements base 10 logs:\n" << B << endl;
gArray<gShortUnsigned> A(raw_data,10,4); cout << "The Array A:\n" << A << endl; cout << "The minimum of the elements in A:\n" << A.getMin() << endl; cout << "The minimum of the elements in A" << endl; cout << "specifing a range [0,5):\n" << A.getMin(0,5) << endl; cout << "The maximum of the elements in A:\n" << A.getMax() << endl; cout << "The maximum of the elements in A" << endl; cout << "with skipnan set to false:\n" << A.getMax(0,0,false) << endl;
//Windowed calculations cout << "The Array A:\n" << A << endl; cout << "The sum of the elements in sliding windows" << endl; cout << "of length 3 with an offset of 1, skipping NAs:\n" << A.getSum(0,0,true,3,1) << endl; cout << "The mean of the elements in sliding windows" << endl; cout << "of length 3 and offset of 1, skipping NAs:\n" << A.getMean(0,0,true,3,1) << endl; cout << "The mean of the elements sliding windows" << endl; cout << "of length 3 and offset of 1, without skipping NAs:\n" << A.getMean(0,0,false,3,1) << endl; }
gSequence seq("ACGATCGACTAGCATCGACA"); cout << "Sequence: "<< seq << endl;
gSequence seq("ACGATCGACTAGCATCGACA"); cout << "Sequence: "<< seq << endl; cout << "Total number of C and A occurrences: " << endl << seq.getCounts(gString("CA")) << endl; cout << "Number of C occurrences in a sliding 4bp window: " << endl << seq.getCounts('C',4) << endl; cout << "Positions of C occurrences:" << endl << which( ((gArray<gChar>)seq)=='C') << endl;
gSequence seq("AAACCC"); cout << "The original sequence: " << seq << endl; seq.insert(2,gSequence("TTT")); cout << "After inserting TTT at position 2: " << seq << endl; seq.remove(5,2); cout << "After removing 2 bases from position 5: " << seq << endl; seq.replace(1,gSequence("GGG")); cout << "After replacing positions from 1 with GGG: " << seq << endl;
gSequence seq1="AAAA"; gSequence seq2=string("CCCC"); gSequence seq3=seq1+seq2+"GGGG"+string("TTTT"); cout << seq3.getLower() << endl; cout << seq3 << endl;
gSequence seq("TGATGTGTTGAGCATCGACA"); cout << "A sequence: " << seq << endl; cout << "Its reverese-complement: " << seq.getReverseComplement() << endl; cout << "Its AA translation, frame=0: " << seq.getTranslated(0) << endl; cout << "Its AA translation, frame=1: " << seq.getTranslated(1) << endl; cout << "Its AA translation, frame=2: " << seq.getTranslated(2) << endl;
class gSequenceRetrieverExampleImplementation:public gSequenceRetrieverImplementation{ private: //String containing the path to the files gString fileprefix; //String containing the files suffix gString filepostfix; virtual gArrayRetrieverImplementation<gChar> * clone() const; virtual gSequence getSequence_Internal(const gString & reference,gPos start,gPos end) const; public: gSequenceRetrieverExampleImplementation(const gString & prefix,const gString & postfix); virtual ~gSequenceRetrieverExampleImplementation(); };
gSequenceRetrieverExampleImplementation::gSequenceRetrieverExampleImplementation(const gString & prefix,const gString & postfix):gSequenceRetrieverImplementation(){ fileprefix=prefix; filepostfix=postfix; } gSequenceRetrieverExampleImplementation::~gSequenceRetrieverExampleImplementation(){ } gArrayRetrieverImplementation<gChar> * gSequenceRetrieverExampleImplementation::clone() const{ return new gSequenceRetrieverExampleImplementation( ((string) fileprefix).c_str() ,((string) filepostfix).c_str() ); }
#include <fstream> //include this for file I/O operations gSequence gSequenceRetrieverExampleImplementation::getSequence_Internal(const gString & reference,gPos start,gPos end) const{ string filename=fileprefix+reference+filepostfix; // obtains the complete filename ifstream filein(filename.c_str()); // and open it for reading if(filein.is_open()){ // check if it's actually opened // if opened gSequence tmp(end-start,'N'); // instantiates a gSequence of the required length filein.seekg(start,ios_base::beg); // offsets the file pointer to the starting position filein.get(getArrayDataPtr(tmp),end-start+1); // gets the data directly into the object memory return tmp; // returns the gSequence }else{ // otherwise return gSequence(); // returns an empty gSequence; // alternatively we could throw a gException } }
gSequenceRetriever retriever(gSequenceRetrieverExampleImplementation("/adat/database/hg18/",".txt")); gSequence seq=retriever.getSequence("chrX",1789568,1789618); cout << "chrX:1789568-1789618\t" << seq << endl;
gElement elementA("chrX",1789568,1789618,true,seq,gRef); gElement elementB("chrX",1789568,1789618,true,retriever,gRef); cout << "Element A: " << elementA << endl; cout << "Element B: " << elementB << endl;
gElement subelementA(elementA,gElementInterval(-10,10),true); gElement subelementB(elementB,gElementInterval(-10,10),true); cout << "Sub-element A:" << subelementA << endl; cout << "Sub-element B:" << subelementB << endl;
gElement element1("chr1",5,45); gElement element2("chr1",gReferenceInterval(5,45)); gElement element3("chr1",5,45,false);
ostream & operator << (ostream & out,const gElement & element) { //You can obtain the reference name... gString refName=element.getReferenceName(); //the reference interval where this element maps... gReferenceInterval refIntv=element.getReferenceInterval(); //the element interval (in element coordinates)... gElementInterval elmIntv=element.getElementInterval(); //the strand relative to the reference... bool strand=element.isForward(); //the length of the element... gSize elementLen=element.getLength(); //and the element's sequence gSequence eseq=element.getSequence(); //details about output formatting are omitted. //...formatting output...
cout << "element1:" << element1 << endl; cout << "element2:" << element2 << endl; cout << "element3:" << element3 << endl;
//some example of elements instantiated as sub-elements. gElement element4(element1,10,20); gElement element5(element1,gElementInterval(-4,46)); gElement element6(element3,gElementInterval(15,25),false); //obtaining a new gElement using the getElement function (with offsets) gElement element7=element1.getElement(gElementInterval(3,8),false,-1,1); //You can do the same for the other three instances we have cout << "element4:" << element4 << endl; cout << "element5:" << element5 << endl; cout << "element6:" << element6 << endl; cout << "element7:" << element7 << endl;
//Make a copy of a gElement gElement element8=element5; //Instantiate and populate a std::vector of gElement objects vector<gElement> elementVector; elementVector.push_back(element1); elementVector.push_back(element2); elementVector.push_back(element3); elementVector.push_back(element4); elementVector.push_back(element5); elementVector.push_back(element6); elementVector.push_back(element7); elementVector.push_back(element8); //output the elements we've instantiated for(size_t i=0;i<elementVector.size();i++){ cout << "Element" << i+1 << elementVector[i] << endl; }
//An element instantiated providing sequence information gElement element1("myRef",10,20,true,"ACGTGACGTA",gElm); //A sub element gElement element2(element1,-10,10,false); //An element instantiated using a retriever (suppose we've instantiated it before) gElement element3("chr1",10100,10150,true,retriever,gElm); //A Sub-Element gElement element4(element3,-10,10,false); cout << "Element with sequence:" << element1 << endl; cout << "Sub element [-10,10), reverse" << element2 << endl; cout << "Element with retriever:" << element3 << endl; cout << "Sub-element [-10,10], reverse" << element4 << endl;
//A gVariation from reference name; gVariations var1("chr1"); //Another one whose reference is obtained from a gElement gElement element("chr1",5,45); gVariations var2(element); //gVariations objects can be copied using the copy constructor... gVariations var4(var2); //or using the assignment operator gVariations var3=var1;
gVariations var("ref1"); var.addInsertion("ref1",30,"ACGTTGTG"); var.addDeletion("ref1",gReferenceInterval(10,20)); var.addSubstitution("ref1",gReferenceInterval(40,41),"A");
cout << "variation reference:" << var.getReferenceName() << endl; cout << "Number of variations: " << setw(10) << var.getCount() << endl << endl; cout << "N#" << setw(10) << "Start" << setw(10) << "End" << setw(10) << "Type" << setw(14) << "Sequence" << endl; for(gSize i=0;i<var.getCount();i++){ cout << i; cout << setw(10) << var.getVariationStart(i); cout << setw(10) << var.getVariationEnd(i); cout << setw(10) << var.getVariationType(i); cout << setw(15) << var.getVariationSequence(i) << endl; } cout << endl << "type legend: 0=Substitution, 1=Insertion, 2=Deletion" << endl;
gElement Element("ref1",5,45,true,"ATATCTGCAGCTTCTCCTTCATCTTCATCTTCTTCCTCTT",gRef); gElement vElement1(Element,Element.getElementInterval(),true,var); gElement vElement2(vElement1,gElementInterval(0,20),true); gElement vElement3("ref1",gReferenceInterval(15,35),false,var); cout << "Original element" << Element << endl; cout << "Varied element 1" << vElement1 << endl; cout << "Varied element 2" << vElement2 << endl; cout << "Varied element 3" << vElement3 << endl;
//An array of reference positions gArray<gPos> referencePositions; referencePositions.setValue(0,2,false); referencePositions.setValue(1,15,false); referencePositions.setValue(2,26,false); referencePositions.setValue(3,47,false); cout << "An array of reference positions" << endl; cout << referencePositions << endl; //Mapping of these positions onto different elements cout << "The corresponding positions on Element," << endl; cout << Element.getElementPositionsFromReference(referencePositions) << endl; cout << "on vElement1," << endl; cout << vElement1.getElementPositionsFromReference(referencePositions) << endl; cout << "and on vElement3" << endl; cout << vElement3.getElementPositionsFromReference(referencePositions) << endl;
//definition of two reference intervals. gReferenceInterval rintv1(2,15); gReferenceInterval rintv2(15,26); cout << "Reference interval: " << rintv1 << endl; cout << "-----------------------------------------------------------" << endl; cout << "maps to Element interval: " << Element.getElementIntervalFromReference(rintv1) << endl; cout << "maps to vElement1 interval: " << vElement1.getElementIntervalFromReference(rintv1) << endl; cout << "maps to vElement3 interval: " << vElement3.getElementIntervalFromReference(rintv1) << endl << endl; cout << "Reference interval: " << rintv2 << endl; cout << "-----------------------------------------------------------" << endl; cout << "maps to Element interval: " << Element.getElementIntervalFromReference(rintv2) << endl; cout << "maps to vElement1 interval: " << vElement1.getElementIntervalFromReference(rintv2) << endl; cout << "maps to vElement3 interval: " << vElement3.getElementIntervalFromReference(rintv2) << endl;
//An array of element positions; gArray<gRelativePos> elementPositions; elementPositions.setValue(0,-13); elementPositions.setValue(1,8); elementPositions.setValue(2,27); cout << "vElement3 Positions: " << endl; cout << elementPositions << endl; cout << "map to the following vElement1 positions: " << endl; cout << vElement3.mapElementPositions(vElement1,elementPositions) << endl << endl; cout << "and to the following reference positions: " << endl; cout << vElement3.getReferencePositionsFromElement(elementPositions) << endl; //two element intervals gElementInterval eintv1(8,27); gElementInterval eintv2(0,8); //obtaining corresponding intervals on another element gElementInterval eintv3=vElement3.mapElementInterval(vElement1,eintv1); gElementInterval eintv4=vElement3.mapElementInterval(vElement1,eintv2); cout << "vElement3 interval: " << eintv1; cout << "maps to vElement1 interval: " << eintv3 << endl; cout << "vElement3 interval: " << eintv2; cout << "maps to vElement1 interval: " << eintv4 << endl << endl; cout << "vElement3 interval: " << eintv1; cout << "maps to reference interval: " << vElement3.getReferenceIntervalFromElement(eintv1) << endl; cout << "vElement3 interval: " << eintv2; cout << "maps to reference interval: " << vElement3.getReferenceIntervalFromElement(eintv2) << endl << endl; cout << "vElement1 interval: " << eintv3; cout << "maps to reference interval: " << vElement1.getReferenceIntervalFromElement(eintv3) << endl; cout << "vElement1 interval: " << eintv4; cout << "maps to reference interval: " << vElement1.getReferenceIntervalFromElement(eintv4) << endl;
//Instantiate an element and add two sites gElement element("chr1",10,20,true,"ACGTTTACGT"); gArray<gArrayIndex> A=element.insertElementSites(3); gArray<gArrayIndex> B=element.insertReferenceSites(18); cout << "Element" << element << endl; //Some variations gVariations variations(element); variations.addDeletion(element,gElementInterval(1,5)); variations.addInsertion("chr1",16,"AAAA"); //A copy of the element adding variations gElement vElement(element,0,10,true,variations); cout << "vElement" << vElement << endl; //Fetch site A positions gArray<gRelativePos> pos1=element.getSitesElementPosition(A); gArray<gRelativePos> pos2=vElement.getSitesElementPosition(A); cout << "Site A:\t"; cout << pos1[0] << " -> "; if(pos2.isNA(0)){ cout << "(" << pos2[0] << ")" << endl; }else{ cout << pos2[0] << endl; } //Fetch site B positions pos1=element.getSitesElementPosition(B); pos2=vElement.getSitesElementPosition(B); cout << "Site B:\t"; cout << pos1[0] << " -> "; if(pos2.isNA(0)){ cout << "(" << pos2[0] << ")" << endl; }else{ cout << pos2[0] << endl; }
//Using the same element of the previous example we add a third site gArray<gArrayIndex> C=element.insertElementSites(6); //Yhen we insert a connection specifying the sites gArray<gArrayIndex> connAC=element.connectSites(A,C); //And also specifying an interval and its orientation (sites are added if not already present) gArray<gArrayIndex> connBC=element.insertElementConnection(gElementInterval(6,9),false); //You can know the number of sites and connections cout << "element contains " << element.getSitesCount() << " sites and "; cout << element.getConnectionsCount() << " connections" << endl << endl; //For each connection you can obtain the corresponding element or reference interval //as well as the corresponding orientation cout << "Element interval for connection AC:"; cout << element.getConnectionElementInterval(connAC[0]); cout << ((element.getConnectionElementStrand(connAC[0]))?(" Forward"):(" Reverse")) << endl; cout << "Element interval for connection BC:"; cout << element.getConnectionElementInterval(connBC[0]); cout << ((element.getConnectionElementStrand(connBC[0]))?(" Forward"):(" Reverse")) << endl; cout << "Reference interval for connection AC:"; cout << element.getConnectionReferenceInterval(connAC[0]); cout << ((element.getConnectionReferenceStrand(connAC[0]))?(" Forward"):(" Reverse")) << endl; cout << "Reference interval for connection BC:"; cout << element.getConnectionReferenceInterval(connBC[0]); cout << ((element.getConnectionReferenceStrand(connBC[0]))?(" Forward"):(" Reverse")) << endl; cout << endl; //You can also know which sites a given site is connected to/from: cout << "Site C connects from the following sites:" << endl << element.connectsFrom(C[0]) << endl; cout << "Site C doesn't connects to any site:" << endl << element.connectsTo(C[0]) << endl; cout << "Site A connects to the following sites:" << endl << element.connectsTo(A[0]) << endl; cout << "Site B connects to the following sites:" << endl << element.connectsTo(B[0]) << endl;
gElement element2(element,0,10,false,variations); cout << element2 << endl; // Now we can reproduce the steps done before to analyse the connections for this new element //.. code omitted, same as previous snippet except that we are using element2 instead of element
gElement elementAC=element.getElement(connAC[0]); cout << "elementAC" << elementAC << endl; gElement elementBC=element.getElement(connBC[0]); cout << "elementBC" << elementBC << endl; gElement elementAC2=element2.getElement(connAC[0]); cout << "elementAC2" << elementAC2 << endl; gElement elementBC2=element2.getElement(connBC[0]); cout << "elementBC2" << elementBC2 << endl;
gFeatureRetriever<gShortUnsigned> retriever(GCRetriever(1,2)); gElement element("chr3",113743339,113743370,true,"CTTCCATATCTGCAGCTTCTCCTTCATCTTC"); cout << retriever.getFeature(element,element.getElementInterval(),-1) << endl;
class GCRetriever:public gFeatureRetrieverImplementation<gShortUnsigned>{ private: //The virtual function that calculates the feature virtual bool calculateFeature(gArray<gShortUnsigned> & values,const gElement & element,const gElementInterval & interval) const; //The virtual function used to obtain a copy of this object virtual gArrayRetrieverImplementation<gShortUnsigned> * clone() const; public: //The constructor GCRetriever(gRelativePos startOffset,gRelativePos endOffset); //The virtual destructor virtual ~GCRetriever(); };
GCRetriever::GCRetriever(gRelativePos startOffset,gRelativePos endOffset):gFeatureRetrieverImplementation<gShortUnsigned>(startOffset,endOffset,true){ } GCRetriever::~GCRetriever(){ } gArrayRetrieverImplementation<gShortUnsigned> * GCRetriever::clone() const{ return new GCRetriever(getStartOffset(),getEndOffset()); }
bool GCRetriever::calculateFeature(gArray<gShortUnsigned> & values,const gElement & element,const gElementInterval & interval) const{ //Obtain the sequence corresponding to the specified interval gArray<gChar> seq=element.getSequence(interval); //Build an array that has 1 at G or C positions. gArray<gShortUnsigned> g_or_c=((seq=='C')||(seq=='G')); //Use the getSum function to calculate the number of "1" in a sliding window g_or_c.getSum(values,0,seq.getSize(),false,getWindowLenght(),getStartOffset()); return true; }
gElement element("chr3",113743339,113743370,true,gSequenceRetriever(gSequenceRetrieverExampleImplementation("/adat/database/hg18/",".txt"))); gFeatureRetriever<gShortUnsigned> retriever(GCRetriever(1,2)); cout << retriever.getFeature(element,element.getElementInterval(),-1) << endl;
gElement element("chr3",113743339,113743370,true,gSequenceRetriever(gSequenceRetrieverExampleImplementation("/adat/database/hg18/",".txt"))); gFeatureRetriever<gShortUnsigned> retriever(GCRetriever(1,2)); cout << "The original element:" << element << endl; cout << retriever.getFeature(element,element.getElementInterval(),-1) << endl; gElement sElement(element,0,10,true); cout << "A sub-element:" << sElement << endl; cout << retriever.getFeature(sElement,sElement.getElementInterval(),-1) << endl; gVariations var(element); var.addDeletion(element,gElementInterval(13,18)); gElement vElement(element,0,31,true,var); cout << "A Varied sub-element:" << vElement << endl; cout << retriever.getFeature(vElement,vElement.getElementInterval(),-1) << endl;
gElement element("chr3",113743339,113743370,true,gSequenceRetriever(gSequenceRetrieverExampleImplementation("/adat/database/hg18/",".txt"))); gFeatureRetriever<gShortUnsigned> retriever(GCRetriever(1,2)); gArrayIndex featnum=element.addFeature<gShortUnsigned>("GC",retriever,-1); cout << "The original element:" << element << endl; cout << element.getFeature<gShortUnsigned>(featnum,element.getElementInterval()) << endl; gElement sElement(element,0,10,true); cout << "A sub-element:" << sElement << endl; cout << sElement.getFeature<gShortUnsigned>(featnum,sElement.getElementInterval()) << endl; gVariations var(element); var.addDeletion(element,gElementInterval(13,18)); gElement vElement(element,0,31,true,var); cout << "A Varied sub-element:" << vElement << endl; cout << vElement.getFeature<gShortUnsigned>(featnum,vElement.getElementInterval()) << endl;
#include <fstream> #include <mysql++/mysql++.h> #include <geco.h> using namespace std; using namespace mysqlpp; using namespace geco;
class gVcf { public: vector<gString> sub_names; vector<gVariations> v1; vector<gVariations> v2; gVcf(const gString & filename,const gElement & element); };
gVcf::gVcf(const gString & filename,const gElement & element) { static char buffer[1000]; gString chrom,Id,Alt,Qual,Filter,Info,Format; gPos pos; gSequence Ref; gReferenceInterval intv=element.getReferenceInterval(); gString nref(element.getReferenceName(),3); ifstream fin(((string) filename).c_str()); if (fin.is_open()) { //Skipping metadata do { fin.getline(buffer,1000,'\n'); } while ((!fin.eof())&&(buffer[1]=='#')); //if header found if (!fin.eof()) { //construnct a gString skipping the first '#' character gString line=buffer+1; //split the header string to obtain a vector with field names vector<gString> header; line.split('\t',header); //fill sub_names with subject names and instatiate two gVariations each for (gSize i=9;i<header.size();i++) { sub_names.push_back(header[i]); v1.push_back(gVariations(element)); v2.push_back(gVariations(element)); } //reads data lines until an eof is found while (!fin.eof()) { //read the first 9 fields (we're interested here in chrom and pos) fin >> chrom >> pos >> Id >> Ref >> Alt >> Qual >> Filter >> Info >> Format; //stop the cycle if we reach the eof if(fin.eof()) break; //positions are 1 based in vcf files so we make them 0 based pos-=1; //check if the variant is inside our region otherwise skip the line if ((chrom==nref)&&(pos>=intv.getStart())&&(pos<intv.getEnd())) { //construct a vector containing alleles for the current variant vector<gString> alleles; //add the referenfce allele (its index will be 0) alleles.push_back(Ref); //and the alternative ones (could be more than one) Alt.split(',',alleles); //read the rest of the line and split it into subject's genotypes fin.getline(buffer,1000,'\n'); line=buffer; vector<gString> genotypes; line.split('\t',genotypes); // for each genotype (i.e. subject): for (size_t i=1;i<genotypes.size();i++) { // get the first allele (here we used the ASCII code subtracting the code // for '0' (48), this works because alleles are less than 10) size_t g1=genotypes[i][0]-48; // and the second one size_t g2=genotypes[i][2]-48; // if the first allele is not reference add the proper variation if (g1>0) { //ref and alt alleles have the same length: Substitution if (alleles[g1].getSize()==alleles[0].getSize()) { v1[i-1].addSubstitution(element.getReferenceName(), gReferenceInterval(pos,pos+alleles[g1].getLength()), alleles[g1] ); //alt allele is longer than referernce one: Insertion } else if (alleles[g1].getSize()>alleles[0].getSize()) { v1[i-1].addInsertion(element.getReferenceName(), pos, gString(alleles[g1],1) ); //alt allele is shorter than referernce one: Deletion } else { v1[i-1].addDeletion(element.getReferenceName(), gReferenceInterval(pos+1,pos+alleles[0].getLength()) ); } } // do the same for the second allele if (g2>0) { if (alleles[g2].getSize()==alleles[0].getSize()) { v2[i-1].addSubstitution(element.getReferenceName(),gReferenceInterval(pos,pos+alleles[g2].getLength()),alleles[g2]); } else if (alleles[g2].getSize()>alleles[0].getSize()) { v2[i-1].addInsertion(element.getReferenceName(),pos,gString(alleles[g2],1)); } else { v2[i-1].addDeletion(element.getReferenceName(),gReferenceInterval(pos+1,pos+alleles[0].getLength())); } } } } else fin.getline(buffer,1000,'\n'); } } } }
class gLocalTxtSequenceRetriever:public gSequenceRetrieverImplementation { private: gString i_path; gString i_extension; virtual gArrayRetrieverImplementation<gChar> * clone() const; virtual gSequence getSequence_Internal(const gString & reference,gPos start,gPos end) const; protected: public: gLocalTxtSequenceRetriever(const gString & path,const gString & extension=".txt"); virtual ~gLocalTxtSequenceRetriever(); };
gLocalTxtSequenceRetriever::gLocalTxtSequenceRetriever(const gString & path,const gString & extension):gSequenceRetrieverImplementation() { i_path=path; i_extension=extension; }
gLocalTxtSequenceRetriever::~gLocalTxtSequenceRetriever() { } gArrayRetrieverImplementation<gChar> * gLocalTxtSequenceRetriever::clone() const { return new gLocalTxtSequenceRetriever(i_path,i_extension); }
gSequence gLocalTxtSequenceRetriever::getSequence_Internal(const gString & reference,gPos start,gPos end) const { gSequence ret; string filename=i_path+reference+i_extension; ifstream filein(filename.c_str()); if (filein.is_open()) { gChar *Buffer=new gChar[end-start+1]; Buffer[end-start]=0; filein.seekg(start,ios_base::beg); filein.get(Buffer,end-start+1); ret=gSequence(gString(Buffer)); delete [] Buffer; } return ret; }
struct dbInfo { string host; int port; string dbname; string user; string passwd; };
gTranscript getUCSCRefseq(const gString & refseqID,const dbInfo & info,const gSequenceRetriever & retriever,gElementSequenceMode mode,gPos startOffset,gPos endOffset) { //The object we will return gTranscript ret; //a Database conncetion Connection * conn=NULL; try { //Connects to the database conn= new Connection(info.dbname.c_str(),info.host.c_str(),info.user.c_str(),info.passwd.c_str(),info.port); //if succeeded if (conn) { //setup and run the query Query myquery = conn->query(); myquery << "select "; myquery << " name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds "; myquery << "from "; myquery << " refGene "; myquery << "where "; myquery << " refGene.name='" << refseqID << "'"; StoreQueryResult res = myquery.store(); //if we have one and only one result if (res.size()==1) { //get the first row in the results Row row = *res.begin(); //get the cds start,end, number of exons gPos cdsStart = atol(row.at(5).data()); gPos cdsEnd = atol(row.at(6).data()); gSize ecount = atoi(row.at(7).data()); //declare exon start and end position arrays gArray<gPos> estart,eend; //exonStarts and exonEnds are given as text fields //with comma separated positions //copy them into two strings for further processing string es = row.at(8).data(); string ee = row.at(9).data(); //close and destroy the connection (we have extracted all we need) conn->disconnect(); delete conn; //split the two position lists filling the estart and eend position arrays for (int i=0;i<ecount;i++) { size_t p1 = es.find_first_of(','); size_t p2 = ee.find_first_of(','); estart.setValue(i,atol(es.substr(0,p1).c_str()),false); eend.setValue(i,atol(ee.substr(0,p2).c_str()),false); es = es.substr(p1+1); ee = ee.substr(p2+1); } //now we can instantiate a gTranscript ret=gTranscript(row.at(1).data(),estart,eend,strcmp(row.at(2).data(),"+")==0,retriever,mode,startOffset,endOffset,cdsStart,cdsEnd); } else { // if no results or more than one // close and destroy the connection conn->disconnect(); delete conn; //throw appropriate exceptions if (res.size()==0) throw gException("Invalid UCSC refSeq identifier"); else throw gException("Multiple match for refSeq identifier"); } } else { // the connection was not ok, we throw an exception to signal this condition throw gException("Unable to connect to the database"); } } catch (exception & e) { //if something unexpected occurred, throwing a known exception object: //close and destroy the connection if (conn) { conn->disconnect(); delete conn; } //throw a gException to signal what happened throw gException(e.what()); } catch (...) { //if something unexpected occurred, throwing an unknown exception object: //close and destroy the connection if (conn) { conn->disconnect(); delete conn; } //throw a gException to signal an unknown error throw gException("unknown error"); } //return the object if no exception occurred return ret; }
class PWMScoreRetriever:public gFeatureRetrieverImplementation<gScore> { private: gMatrix<gScore> i_mat; gScore i_threshold; gArray<gPos> i_positions; virtual bool calculateFeature(gArray<gScore> & values,const gElement & element,const gElementInterval & interval) const; virtual gArrayRetrieverImplementation<gScore> * clone() const; public: PWMScoreRetriever(const gMatrix<gScore> & mat,double threshold); virtual ~PWMScoreRetriever(); };
PWMScoreRetriever::PWMScoreRetriever(const gMatrix<gScore> & mat,double threshold):gFeatureRetrieverImplementation<gScore>(0,mat.getColsNum()){ i_mat=mat; i_threshold=threshold; i_positions=getArray<gPos>(0,mat.getColsNum()-1,1); }
PWMScoreRetriever::~PWMScoreRetriever() { } gArrayRetrieverImplementation<gScore> * PWMScoreRetriever::clone() const { return new PWMScoreRetriever(i_mat,i_threshold); }
A | -> | i_mat(0,0) |
C | -> | i_mat(1,1) |
G | -> | i_mat(2,2) |
T | -> | i_mat(3,3) |
G | -> | i_mat(2,4) |
T | -> | i_mat(3,5) |
G | -> | i_mat(2,6) |
// here we construct the "ast" array containing row positions 0,1,2,3 corresponding to A,C,G T or a,c,g,t ASCII codes // this is global and static static const gPos char_map[256]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; static const gPos char_map_NA[248]={0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,66,68,69,70,72,73,74,75,76,77,78,79,80,81,82,83,85,86,87,88,89,90,91,92,93,94,95,96,98,100,101,102,104,105,106,107,108,109,110,111,112,113,114,115,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255}; static const gArray<gPos> ast(char_map,256,gArray<gPos>(char_map_NA,248)); bool PWMScoreRetriever::calculateFeature(gArray<gScore> & values,const gElement & element,const gElementInterval & interval) const { //get the sequence in the required interval gSequence seq=element.getSequence(interval); //build the whole sequence row indices array gArray<gPos> iseq=ast[seq]; //the number of positions gSize mlength=i_mat.getColsNum(); //pre-allocate (to save time) the array that will contain the score values for each matrix column position gArray<gScore> vals(0,mlength,false,false); //pre-allocate (to save time) the single value array which will contain the (possibly NA) sum of the columns values. gArray<gScore> sum(0,1,false,false); //cycle through sequence positions for (gPos i=0;i<seq.getLength()-mlength;i++) { //get the sum of columns score for the current sub-sequence i_mat(vals,gArray<gPos>(iseq,i,i+mlength),i_positions).getSum(sum); if(!sum.isNA(0)){ // if the sum is not NA (please note that sum can be NA if the iseq // sub-interval contais some NA value: for example if the sequence // contais some character different from a,c,g or t) //set the value at position i if (sum[0]>i_threshold) values.setValue(i,sum[0],false); //to the score if it's above threshold else values.setValue(i,0,false); //to 0 otherwise } //we don't need to set the value to NA here, since values come already //instantiated with anything set to NA } return true; }
// SF2/ASF static const double valEse1[28]={ -1.139660, 0.621677, -1.584962, 1.323616, -1.584962, -1.584962, 0.621677, 1.365525,-1.103778, 0.725053, 0.330068, 0.938765, -1.584962, -1.584962, -0.211782, 0.174455, 0.478823,-1.584962, 0.334650, 0.992775, -0.105105, -1.584962,-0.500847, -1.584962,-1.127254, -1.584962, -1.127254, 0.268437 }; const double threshold_ese1 = 1.956; // SRp40 static const double valEse2[28]={ -0.130243, -1.584962, 1.276539, -0.326247, 0.969599, -0.130243, -1.584962, 0.562681, 0.681752, -1.119013, 1.242778, -0.767378, 0.131965, -0.045863, -1.584962, -0.139795, -1.332258, -0.481546, -1.584962, 0.437586, 0.796774, 0.919376, 0.369696, 0.229264, -1.141854, 0.723873, -3.169924, -0.130243 }; const double threshold_ese2 = 2.670; // SRp55 static const double valEse3[24]={ -0.662469, 0.110350, -0.662469, 0.110350, -1.584962, 0.610874, 0.393557, -1.584962, 1.477196, -1.584962, -1.584962, 0.976113, -1.584962, 0.719010, -1.584962, 0.719010, 0.206371, -0.787198, 1.224306, -1.584962, -0.074500, -1.584962, 1.020243, -1.584962 }; const double threshold_ese3 = 2.676; // SRp40 static const double valEse4[32]={ -0.882950, 0.090266, -0.058735, -1.584962, 0.090266, -0.412767, -0.058735, 0.225310, -1.159392, -1.584962, 0.951947, 1.113179, 0.562681, 0.864029, 0.319936, -1.584962, 0.869122, 0.451118, -1.355627, -1.584962, -0.334998, -0.051280, -1.355627, 0.675206, -1.180520, -0.196560, 0.383015, 0.882338, -0.196560, -0.864986, 0.964376, -1.584962 }; const double threshold_ese4 = 2.383;
int main(int argc, char **argv) { // Declare variables to hold command line parameters gString transcriptId,vcfFile; // Command line parmater parsing if(argc==3){ //If the correct number of parameters have been specified //read the transcript ID transcriptId=argv[1]; //and the vcf file name vcfFile=argv[2]; }else{ //if the number of parameters were not correct //signal the error condition and exit throw gException("Usage: varese transcriptId vcf_file"); } // Instantiate a sequence retriever // (please consider that we have not parametrized this, although it could be done) gSequenceRetriever retriever(gLocalTxtSequenceRetriever("/adat/database/hg18/",".txt")); // UCSC MySQL server connection information // (this could be parametrized, too) dbInfo info; info.dbname="hg18"; info.host="genome-mysql.cse.ucsc.edu"; info.user="genome"; info.passwd=""; info.port=0; // Get a gTranscript object matching the ID gTranscript refTtranscript=getUCSCRefseq(transcriptId,info,retriever,gRef,0,0); // Now we construct a retriever for ESE1 PWMScoreRetriever ese1Retriever(gMatrix<gScore>(4,7,valEse1),threshold_ese1); // and add to the transcript a corresponding feature refTtranscript.addFeature<gScore>("ese1",ese1Retriever); //this could be done with a single line like we do for the rest of the ESEs... refTtranscript.addFeature<gScore>("ese2",PWMScoreRetriever(gMatrix<gScore>(4,7,valEse2),threshold_ese2)); refTtranscript.addFeature<gScore>("ese3",PWMScoreRetriever(gMatrix<gScore>(4,6,valEse3),threshold_ese3)); refTtranscript.addFeature<gScore>("ese1",PWMScoreRetriever(gMatrix<gScore>(4,8,valEse4),threshold_ese4)); // To count the number of ESEs in the reference transcript exons // we instatinate a matrix with n# exons rows and 4 (ESE) columns gMatrix<int> ref_ese_counts(refTtranscript.getExonCount(),4); // then for each exon for (gArrayIndex e=0;e<refTtranscript.getExonCount();e++) { // and for each ese type for (gArrayIndex m=0;m<4;m++) { // we get the exon interval... gElementInterval ref_eintv=refTtranscript.getExonInterval(e); //the ese scores in the exon interval... gArray<gScore> ref_ese=refTtranscript.getFeature<gScore>(m,ref_eintv); // the positions relative to the exon interval of the non-zero scores... gArray<gPos> ref_ese_pos= which(ref_ese > 0); // and finally we save their value in our matrix ref_ese_counts.setValue(e,m,ref_ese_pos.getSize(),false); } } // Now we extract haplotype information from the vcf file gVcf haplotypes(vcfFile,refTtranscript); // We declare an auxiliary variable to store the number of ESE int var_ese_count; // For each subject for (size_t i=0; i<haplotypes.sub_names.size();i++) { // Output the subject's ID cout << haplotypes.sub_names[i] << endl; cout << "-------------------" << endl; // For the first haplotype cout << "H1" << endl; // we contruct the "varied" gTranscript: it will inherit all the refTranscript features // which will be automatically recalculated where needed (i.e. where the variations occur) gTranscript v1(refTtranscript,haplotypes.v1[i]); // Then, using the varied transcript, for each exon and ESE type we do the same we've done // before for the reference transcript (this time, we write the code in an equivalent but more // compact form) for (gArrayIndex e=0;e<v1.getExonCount();e++) { for (gArrayIndex m=0;m<4;m++) { // Get the ESE count for the H1 transcript // (note that we do not have to bother about indels effect on exon positions: // this is automatically handled by the gElement object) var_ese_count=which(v1.getFeature<gScore>(m,v1.getExonInterval(e))>0).getSize(); // if the count differs from the reference... if (var_ese_count!=ref_ese_counts(e,m)) { // output the curren exon number cout << "\tE" << e+1 << " "; // the difference in ESE count / the reference ESE count cout << var_ese_count-ref_ese_counts(e,m) << "/" << ref_ese_counts(e,m); // the ESE type cout << " ESE" << m+1 << endl; } } } // Same for the other haplotype cout << "H2" << endl; gTranscript v2(refTtranscript,haplotypes.v2[i]); for (gArrayIndex e=0;e<v2.getExonCount();e++) { for (gArrayIndex m=0;m<4;m++) { var_ese_count=which(v2.getFeature<gScore>(m,v2.getExonInterval(e))>0).getSize(); if (var_ese_count!=ref_ese_counts(e,m)) { cout << "\tE" << e+1 << " "; cout << var_ese_count-ref_ese_counts(e,m) << "/" << ref_ese_counts(e,m); cout << " ESE" << m+1 << endl; } } } cout << "-------------------" << endl << endl; } //We're done return 0; }
Task | C++ | R |
Vcf parsing: | 67 | 77 |
Sequence retrieving: | 34 | 2 |
Transcript annotation retrieving: | 64 | 7 |
ESE score calculation: | 40 | 35 |
Static data (i.e. ESE matrices): | 28 | 19 |
Main program: | 70 | 141 |
Total: | 303 | 281 |
#include <cstdlib> #include <iostream> #include <geco.h> using namespace std; using namespace geco; int main(int argc, char **argv) { bool dummy; //Parameters parsing if(argc==5){ srand(1); unsigned len=atol(argv[1]); float dup=atof(argv[2]); float frag=atof(argv[3]); int mode=atol(argv[4]); unsigned wlen,nwin; //calcuate number and length of sub-arrays (windows) if(dup==0){ nwin=0; wlen=0; }else{ nwin=max((unsigned) ((float) len*dup*frag),(unsigned) 1); wlen=(float) len * dup / (float) nwin; } //for mode 1 if(mode==1){ //Instantiate and populate the array vector<unsigned> array(len,0); for(unsigned i=0;i<len;i++){ unsigned val=rand() % 1000 +1; array[i]=val; } // declare a vector to hold copies vector< vector<unsigned> > sub_arrays; // instantiate the sub-arrays for(size_t i=0;i<nwin;i++){ sub_arrays.push_back(vector<unsigned>(array.begin()+i,array.begin()+i+wlen)); } }else if(mode==2){ vector<unsigned> array(len,0); vector<bool> Nas(len,0); for(unsigned i=0;i<len;i++){ unsigned val=rand() % 1000 +1; bool isNA= (rand() % 1000)<5; array[i]=val; Nas[i]=isNA; } vector< vector<unsigned> > sub_arrays; vector< vector<bool> > sub_nas; for(size_t i=0;i<nwin;i++){ sub_arrays.push_back(vector<unsigned>(array.begin()+i,array.begin()+i+wlen)); sub_nas.push_back(vector<bool>(Nas.begin()+i,Nas.begin()+i+wlen)); } }else if(mode==3){ gArray<unsigned> array(0,len,false); for(unsigned i=0;i<len;i++){ unsigned val=rand() % 1000 +1; bool isNA = (rand() % 1000)<5; array.setValue(i,val,isNA); } vector< gArray<unsigned> > sub_arrays; for(gPos i=0;i<nwin;i++){ sub_arrays.push_back(gArray<unsigned>(array,i,i+wlen)); } } return 0; }else{ cout << "Usage: window_test len %win %dup mode" << endl; return 1; } }