Variation Toolkit

Variation Toolkit (beta version)

Tools for processing Variant Call Format (VCF) files, TSV, etc.

Pierre Lindenbaum PhD
UMR915 Institut du thorax
Nantes, France
@yokofakun
http://[email protected]

http://code.google.com/p/variationtoolkit/

http://www.flickr.com/photos/ohm17/162622755

http://code.google.com/p/knime4bio/

Command line

Command Line

http://commons.wikimedia.org/wiki/File:Plastic_tubing.jpg

scanvcf

$ head -n3 input.txt

#Sample VCF
Sample1 data/sample1.vcf.gz
Sample2 data/sample1.vcf.gz
Sample2 data/sample1.vcf.gz

$ scanvcf < input.txt

#CHROM POS ID REF ALT QUAL FILTER . FORMAT Call SAMPLE
1 879317 rs7523549 C T 71 0 . GT:GQ:DP:FLT 0/1:34:8:0 Sample1
1 880238 rs3748592 A G 51 0 . GT:GQ:DP:FLT 1/1:51:8:0 Sample1
1 880390 rs3748593 C A 99 0 . GT:GQ:DP:FLT 1/0:99:30:0 Sample1
1 881627 rs2272757 G A 99 0 . GT:GQ:DP:FLT 1/0:59:20:0 Sample1
(...)
Y 13524507 . C T 99 0 . GT:GQ:DP:FLT 1/1:99:233:0 Sample20
Y 21154323 rs10465459 G A 99 0 . GT:GQ:DP:FLT 1/1:99:215:0 Sample20
Y 21154426 rs52812045 G A 99 0 . GT:GQ:DP:FLT 1/0:99:143:0 Sample20
Y 21154466 rs10465460 T A 99 0 . GT:GQ:DP:FLT 1/1:99:134:0 Sample20
Y 21154529 . G A 51 0 . GT:GQ:DP:FLT 1/1:51:8:0 Sample20

extractinfo

$ gunzip -c data.vcf.gz |\
  extractinfo -t GN -i |\
  awk -F '\t' '($11 =="NOTCH2")' |\
  cut -d $'\t' -f 3 | grep rs

rs6685892
rs2493392
rs2493420
rs7534585
rs7534586
rs2493409
rs2453040
rs2124109

vcfintersect

$ echo -e "#CHROM\tPOS\nchr1\t10519" |\
  vcfintersect -n NO_MATCH -c2 2 -s2 3 -e2 4 \
    -f "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/snp132.txt.gz" |\
  verticalize -n

>>>1$1 #CHROM$2 POS>2$1 chr1$2 10519$3 585$4 chr1$5 10518$6 10519$7 rs62636508$8 0$9 +$10 G$11 G$12 C/G$13 genomic$14 single$15 by-1000genomes$16 0$17 0$18 unknown$19 exact$20 1$21 $22 2$23 1000GENOMES,BCMHGSC_JDW,$24 2$25 G,C,$26 112.000000,8.000000,$27 0.933333,0.066667,$28 >3$1#CHROM chr1$2POS 12190$3REF A$4ALT G$5knownGene.name uc010nxq.1$6knownGene.strand +$7knownGene.txStart 11873$8knownGene.txEnd 14409$9knownGene.cdsStart 12189$10knownGene.cdsEnd 13639$11prediction.type EXON|EXON_CODING_NON_SYNONYMOUS$12prediction.pos_in_cdna 0$13prediction.pos_in_protein1$14prediction.exon Exon 1$15prediction.intron .$16prediction.wild.codon ATG$17prediction.mut.codon GTG$18prediction.wild.aa M$19prediction.mut.aa V$20prediction.wild.prot MSESINFSHNLGQLLSPPRCVVMPGMPFPSIRSPELQKTTADLDHTLVSV$21prediction.mut.prot VSESINFSHNLGQLLSPPRCVVMPGMPFPSIRSPELQKTTADLDHTLVSV$22prediction.wild.rna ATGAGTGAGAGCATCAACTTCTCTCACAACCTAGGCCAGCTCCTGTCTCC$23prediction.mut.rna GTGAGTGAGAGCATCAACTTCTCTCACAACCTAGGCCAGCTCCTGTCTCC$24prediction.splicing .ref:3

1 11 21 31 41 51 61 AGCATGTTAGATAA****GATA**GCTGTGCTAGTAGGCAG*TCAGCGCCATNNNNNNNNNNNNNNNNNNNNNNNNNNNN ........ .... ......K.K......K. .......... ........AGAG....***... ,,,,, ,,,,,,,,, ......GG**....AA ..C...**** ...**...>>>>>>>>>>>>>>T.....

>ref2:2

1 11 21 31 41 51 61 aggttttataaaac****aattaagtctacagagcaactacgcgNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN.............Y ..W................... ..............****..A... .............****..A...T. .........AAAT............. C...T****.................... ..T****..................... T****......................

vcftabix

$ gunzip -c data.vcf.gz |\
  grep -v "##" | normalizechrom -E |\
  vcftabix -f 1000G.vcf.gz -m 1 |\
  awk '($8=".")' | awk '($18=".")'

#CHROM POS ID REF ALT QUAL FILTER . FORMAT Call #CHROM POS ID REF ALT QUAL FILTER .
1 879317 rs7523549 C T 71 0 . GT:GQ:DP:FLT 0/1:34:8:0 1 879317 rs7523549 C T . PASS .
1 880238 rs3748592 A G 51 0 . GT:GQ:DP:FLT 1/1:51:8:0 1 880238 rs3748592 A G . PASS .
1 880390 rs3748593 C A 99 0 . GT:GQ:DP:FLT 1/0:99:30:0 1 880390 rs3748593 C A . PASS .
1 881627 rs2272757 G A 99 0 . GT:GQ:DP:FLT 1/0:59:20:0 1 881627 rs2272757 G A . PASS .
1 883625 rs4970378 A G 39 0 . GT:GQ:DP:FLT 1/1:39:4:0 1 883625 rs4970378 A G . PASS .
1 887560 rs3748595 A C 99 0 . GT:GQ:DP:FLT 1/1:99:40:0 1 887560 rs3748595 A C . PASS .
1 887801 rs3828047 A G 99 0 . GT:GQ:DP:FLT 1/1:99:32:0 1 887801 rs3828047 A G . PASS .
1 888639 rs3748596 T C 99 0 . GT:GQ:DP:FLT 1/1:99:32:0 1 888639 rs3748596 T C . PASS .
1 888659 rs3748597 T C 99 0 . GT:GQ:DP:FLT 1/1:99:26:0 1 888659 rs3748597 T C . PASS .
(...)

mysqlquery

$ echo -e "#Gene\nuc001aaa.3\nHello\nuc001aac.3" |\
  mysqlquery --host localhost --user anonymous --port 3316 \
    -q 'select mRNA,description from kgXref where kgId="$1"' |\
  verticalize

>>>2$1#Gene uc001aaa.3$2mRNA BC032353$3descriptionHomo sapiens mRNA for DEAD/H box polypeptide 11 like 1 (DDX11L1 gene).>3$1#Gene Hello$2mRNA .$3description.>4$1#Gene uc001aac.3$2mRNA BC063459$3descriptionHomo sapiens cDNA FLJ31670 fis, clone NT2RI2004984.3$1#CHROM 1(...)

uniprot

$ echo -e "#POS\tID\n54\tQ04721\n166\tP03536" |\
  uniprot -p 1 -s 2 |\
  verticalize

>>>2$1#POS 54$2ID Q04721$3uniprot.beg 26$4uniprot.end 2471$5uniprot.type chain$6uniprot.status .$7uniprot.desc Neurogenic locus notch homolog protein 2$8uniprot.evidence.$9uniprot.ref .>3$1#POS 54$2ID Q04721$3uniprot.beg 26$4uniprot.end 1677$5uniprot.type topological domain$6uniprot.status potential$7uniprot.desc Extracellular$8uniprot.evidence.$9uniprot.ref .>2$1#Gene NOTCH2$2stringId 9606.ENSP00000256646$3preferredNameNOTCH2$4annotation Notch homolog 2 (Drosophila); Functions as a receptor...>3$1#Gene EIF4G1$2stringId 9606.ENSP00000316879$3preferredNameEIF4G1$4annotation eukaryotic translation initiation factor 4 gamma, 1; ...>4$1#Gene PABPC1$2stringId 9606.ENSP00000313007$3preferredNamePABPC1$4annotation poly(A) binding protein, cytoplasmic 1; Binds the...>2$1#Gene NOTCH2$2stringId 9606.ENSP00000256646$3preferredNameNOTCH2$4annotation Notch homolog 2 (Drosophila); Functions as a receptor for membr$5interactor 9606.ENSP00000256646$6stringId 9606.ENSP00000256646$7preferredNameNOTCH2$8annotation Notch homolog 2 (Drosophila); Functions as a receptor for membr>3$1#Gene NOTCH2$2stringId 9606.ENSP00000256646$3preferredNameNOTCH2$4annotation Notch homolog 2 (Drosophila); Functions as a receptor for membr$5interactor 9606.ENSP00000345206$6stringId 9606.ENSP00000345206$7preferredNameRBPJ$8annotation recombination signal binding protein for immunoglobulin kappa J>4$1#Gene NOTCH2$2stringId 9606.ENSP00000256646$3preferredNameNOTCH2$4annotation Notch homolog 2 (Drosophila); Functions as a receptor for membr$5interactor 9606.ENSP00000355718$6stringId 9606.ENSP00000355718$7preferredNameDLL1

emblstringinteractions

$ echo -e "#Gene\nNOTCH2\nEIF4G1\nPABPC1" |\
  emblstringresolve -c 1 |\
  emblstringinteractors -c 2 |\
  emblstringresolve -c 5 |\
  verticalize

>>>2$1#Gene NOTCH2$2stringId 9606.ENSP00000256646$3preferredNameNOTCH2$4annotation Notch homolog 2 (Drosophila); Functions as a receptor for membr$5interactor 9606.ENSP00000256646$6stringId 9606.ENSP00000256646$7preferredNameNOTCH2$8annotation Notch homolog 2 (Drosophila); Functions as a receptor for membr>3$1#Gene NOTCH2$2stringId 9606.ENSP00000256646$3preferredNameNOTCH2$4annotation Notch homolog 2 (Drosophila); Functions as a receptor for membr$5interactor 9606.ENSP00000345206$6stringId 9606.ENSP00000345206$7preferredNameRBPJ$8annotation recombination signal binding protein for immunoglobulin kappa J>4$1#Gene NOTCH2$2stringId 9606.ENSP00000256646$3preferredNameNOTCH2$4annotation Notch homolog 2 (Drosophila); Functions as a receptor for membr$5interactor 9606.ENSP00000355718$6stringId 9606.ENSP00000355718$7preferredNameDLL1

vcfcut

$ curl -s "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20100804/ALL.2of4intersection.20100804.sites.vcf.gz" |\
  gunzip -c | grep -v "##" |\
  vcfcut -e '2:10kb+500bp; 1:10000-20000'

#CHROM POS ID REF ALT QUAL FILTER INFO
1 10327 rs112750067 T C . PASS DP=65;AF=0.208;CB=BC,NCBI
1 10469 rs117577454 C G . PASS DP=2055;AF=0.020;CB=UM,BC,NCBI
1 10492 rs55998931 C T . PASS DP=231;AF=0.167;CB=BC,NCBI
(...)
1 16841 . G T . PASS DP=2906;AF=0.004;CB=UM,BI;EUR_R2=0.248
2 10038 . C A . PASS DP=73;AF=0.409;CB=BC,NCBI
2 10075 . C A . PASS DP=31;AF=0.150;CB=BC,NCBI
2 10144 . C A . PASS DP=33;AF=0.562;CB=BC,NCBI
2 10159 . C A . PASS DP=32;AF=0.222;CB=BC,NCBI
2 10205 . T G . PASS DP=582;AF=0.107;CB=UM,BC
2 10297 . G T . PASS DP=500;AF=0.246;CB=UM,BC
2 10363 . G A . PASS DP=788;AF=0.016;CB=UM,BI;EUR_R2=0...

ucscgenesps

$ cat sample2vcf.tsv | tr -d ' ' |\
  scanvcf |\
  awk -F '\t' '($3==".")' |\
  normalizechrom |\
  sort -t $'\t' -k1,1 -k2,2n -k11,11 |\
  head -n 10000 |\
  ucscgenesps --host localhost --user username --port 3316 -s 11 \
    > result.ps

igvcontrol

vcfliftover

$ curl -s "http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp129.txt.gz" |\
  gunzip -c |\
  cut -d $'\t' -f 2,3,5 |\
  vcfliftover -1 -f /path/to/hg18ToHg19.over.chain

chr1 433 rs56289060 chr1 10433 10434 .
chr1 491 rs55998931 chr1 10491 10492 .
chr1 518 rs62636508 chr1 10518 10519 .
chr1 582 rs58108140 chr1 10582 10583 .
chr1 690 rs10218492 chr1 10827 10828 .
chr1 766 rs10218493 chr1 10903 10904 .
chr1 789 rs10218527 chr1 10926 10927 .
chr1 800 rs28853987 chr1 10937 10938 .
chr1 876 rs28484712 chr1 11013 11014 .
chr1 884 rs28775022 chr1 11021 11022 .
(...)
chr11609710rs61776794...Deleted in new
chr11609743rs61776795...Deleted in new
chr11609758rs61776796...Deleted in new
chr11609849rs7413891...Deleted in new
chr11610719rs3737622...Deleted in new
chr11610719rs45576038...Deleted in new
chr11610763rs3737624...Deleted in new
chr12475133rs3091278...Deleted in new
chr12475134rs3091239...Deleted in new
(...)

backlocate

$ echo -e "NOTCH2\tM1T\nEIF4G1\tD240Y" |\
  backlocate -f /path/to/hg19.fa

#User.GeneAA1petide.pos.1AA2knownGene.nameknownGene.strandknownGene.AAindex0.in.rnacodonbase.in.rnachromosomeindex0.in.genomicexon##uc001eik.2NOTCH2M1Tuc001eik.2-M0ATGAchr1120612019Exon 1NOTCH2M1Tuc001eik.2-M1ATGTchr1120612018Exon 1NOTCH2M1Tuc001eik.2-M2ATGGchr1120612017Exon 1##uc001eil.2NOTCH2M1Tuc001eil.2-M0ATGAchr1120612019Exon 1NOTCH2M1Tuc001eil.2-M1ATGTchr1120612018Exon 1NOTCH2M1Tuc001eil.2-M2ATGGchr1120612017Exon 1##uc001eim.3NOTCH2M1Tuc001eim.3-M0ATGAchr1120548116Exon 2NOTCH2M1Tuc001eim.3-M1ATGTchr1120548115Exon 2NOTCH2M1Tuc001eim.3-M2ATGGchr1120548114Exon 2##Warning ref aminod acid for uc003fnp.2 [240] is not the same (I/D)EIF4G1D240Yuc003fnp.2+I717ATCAchr3184039089Exon 10EIF4G1D240Yuc003fnp.2+I718ATCTchr3184039090Exon 10EIF4G1D240Yuc003fnp.2+I719ATCCchr3184039091Exon 10##Warning ref aminod acid for uc003fnu.3 [240] is not the same (I/D)EIF4G1D240Yuc003fnu.3+I717ATCAchr3184039089Exon 9EIF4G1D240Yuc003fnu.3+I718ATCTchr3184039090Exon 9EIF4G1D240Yuc003fnu.3+I719ATCCchr3184039091Exon 9##Warning ref aminod acid for uc003fnq.2 [240] is not the same (V/D)EIF4G1D240Yuc003fnq.2+V717GTAGchr3184039350Exon 7EIF4G1D240Yuc003fnq.2+V718GTATchr3184039351Exon 7EIF4G1D240Yuc003fnq.2+V719GTAAchr3184039352Exon 7##Warning ref aminod acid for uc003fnr.2 [240] is not the same (L/D)EIF4G1D240Yuc003fnr.2+L717CTCCchr3184039581Exon 6EIF4G1D240Yuc003fnr.2+L718CTCTchr3184039582Exon 6EIF4G1D240Yuc003fnr.2+L719CTCCchr3184039583Exon 6##Warning ref aminod acid for uc003fny.3 [240] is not the same (T/D)EIF4G1D240Yuc003fny.3+T717ACCAchr3184039677Exon 3EIF4G1D240Yuc003fny.3+T718ACCCchr3184039678Exon 3EIF4G1D240Yuc003fny.3+T719ACCCchr3184039679Exon 3##uc010hxx.2EIF4G1D240Yuc010hxx.2+D717GATGchr3184038780Exon 10EIF4G1D240Yuc010hxx.2+D718GATAchr3184039069Exon 11EIF4G1D240Yuc010hxx.2+D719GATTchr3184039070Exon 11##Warning ref aminod acid for uc003fns.2 [240] is not the same (L/D)EIF4G1D240Yuc003fns.2+L717CTCCchr3184039209Exon 10EIF4G1D240Yuc003fns.2+L718CTCTchr3184039210Exon 
10EIF4G1D240Yuc003fns.2+L719CTCCchr3184039211Exon 10

Thank you

Klicken Sie, um das Format des Titeltextes zu bearbeiten

Klicken Sie, um die Formate des Gliederungstextes zu bearbeiten
Zweite Gliederungsebene
Dritte Gliederungsebene
Vierte Gliederungsebene
Fünfte Gliederungsebene
Sechste Gliederungsebene
Siebente Gliederungsebene
Achte Gliederungsebene
Neunte Gliederungsebene

Health & Medicine

Variation Toolkit