Variation Toolkit

Embed Size (px)

Citation preview

Variation Toolkit (beta version)

Tools for processing Variant Call Format (VCF) files, TSV, etc.

Pierre Lindenbaum PhD, UMR915 Institut du thorax, Nantes, France. @yokofakun http://[email protected]

http://code.google.com/p/variationtoolkit/

http://www.flickr.com/photos/ohm17/162622755

http://code.google.com/p/knime4bio/

Command line

Command Line

http://commons.wikimedia.org/wiki/File:Plastic_tubing.jpg

scanvcf

$ head -n3 input.txt

#Sample VCF
Sample1 data/sample1.vcf.gz
Sample2 data/sample1.vcf.gz
Sample2 data/sample1.vcf.gz

$ scanvcf < input.txt

#CHROM POS ID REF ALT QUAL FILTER . FORMAT Call SAMPLE
1 879317 rs7523549 C T 71 0 . GT:GQ:DP:FLT 0/1:34:8:0 Sample1
1 880238 rs3748592 A G 51 0 . GT:GQ:DP:FLT 1/1:51:8:0 Sample1
1 880390 rs3748593 C A 99 0 . GT:GQ:DP:FLT 1/0:99:30:0 Sample1
1 881627 rs2272757 G A 99 0 . GT:GQ:DP:FLT 1/0:59:20:0 Sample1
(...)
Y 13524507 . C T 99 0 . GT:GQ:DP:FLT 1/1:99:233:0 Sample20
Y 21154323 rs10465459 G A 99 0 . GT:GQ:DP:FLT 1/1:99:215:0 Sample20
Y 21154426 rs52812045 G A 99 0 . GT:GQ:DP:FLT 1/0:99:143:0 Sample20
Y 21154466 rs10465460 T A 99 0 . GT:GQ:DP:FLT 1/1:99:134:0 Sample20
Y 21154529 . G A 51 0 . GT:GQ:DP:FLT 1/1:51:8:0 Sample20

extractinfo

$ gunzip -c data.vcf.gz |\
    extractinfo -t GN -i |\
    awk -F ' ' '($11 =="NOTCH2")' |\
    cut -d ' ' -f 3 | grep rs

rs6685892
rs2493392
rs2493420
rs7534585
rs7534586
rs2493409
rs2453040
rs2124109

vcfintersect

$ echo -e "#CHROM\tPOS\nchr1\t10519" |\
    vcfintersect -n NO_MATCH -c2 2 -s2 3 -e2 4 \
      -f "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/snp132.txt.gz" |\
    verticalize -n

>>>1$1 #CHROM$2 POS>2$1 chr1$2 10519$3 585$4 chr1$5 10518$6 10519$7 rs62636508$8 0$9 +$10 G$11 G$12 C/G$13 genomic$14 single$15 by-1000genomes$16 0$17 0$18 unknown$19 exact$20 1$21 $22 2$23 1000GENOMES,BCMHGSC_JDW,$24 2$25 G,C,$26 112.000000,8.000000,$27 0.933333,0.066667,$28 >3$1#CHROM chr1$2POS 12190$3REF A$4ALT G$5knownGene.name uc010nxq.1$6knownGene.strand +$7knownGene.txStart 11873$8knownGene.txEnd 14409$9knownGene.cdsStart 12189$10knownGene.cdsEnd 13639$11prediction.type EXON|EXON_CODING_NON_SYNONYMOUS$12prediction.pos_in_cdna 0$13prediction.pos_in_protein1$14prediction.exon Exon 1$15prediction.intron .$16prediction.wild.codon ATG$17prediction.mut.codon GTG$18prediction.wild.aa M$19prediction.mut.aa V$20prediction.wild.prot MSESINFSHNLGQLLSPPRCVVMPGMPFPSIRSPELQKTTADLDHTLVSV$21prediction.mut.prot VSESINFSHNLGQLLSPPRCVVMPGMPFPSIRSPELQKTTADLDHTLVSV$22prediction.wild.rna ATGAGTGAGAGCATCAACTTCTCTCACAACCTAGGCCAGCTCCTGTCTCC$23prediction.mut.rna GTGAGTGAGAGCATCAACTTCTCTCACAACCTAGGCCAGCTCCTGTCTCC$24prediction.splicing .ref:3

1 11 21 31 41 51 61 AGCATGTTAGATAA****GATA**GCTGTGCTAGTAGGCAG*TCAGCGCCATNNNNNNNNNNNNNNNNNNNNNNNNNNNN ........ .... ......K.K......K. .......... ........AGAG....***... ,,,,, ,,,,,,,,, ......GG**....AA ..C...**** ...**...>>>>>>>>>>>>>>T.....

>ref2:2

1 11 21 31 41 51 61 aggttttataaaac****aattaagtctacagagcaactacgcgNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN.............Y ..W................... ..............****..A... .............****..A...T. .........AAAT............. C...T****.................... ..T****..................... T****......................

vcftabix

$ gunzip -c data.vcf.gz |\
    grep -v "##" | normalizechrom -E |\
    vcftabix -f 1000G.vcf.gz -m 1 |\
    awk '($8=".")' | awk '($18=".")'

#CHROM POS ID REF ALT QUAL FILTER . FORMAT Call #CHROM POS ID REF ALT QUAL FILTER .
1 879317 rs7523549 C T 71 0 . GT:GQ:DP:FLT 0/1:34:8:0 1 879317 rs7523549 C T . PASS .
1 880238 rs3748592 A G 51 0 . GT:GQ:DP:FLT 1/1:51:8:0 1 880238 rs3748592 A G . PASS .
1 880390 rs3748593 C A 99 0 . GT:GQ:DP:FLT 1/0:99:30:0 1 880390 rs3748593 C A . PASS .
1 881627 rs2272757 G A 99 0 . GT:GQ:DP:FLT 1/0:59:20:0 1 881627 rs2272757 G A . PASS .
1 883625 rs4970378 A G 39 0 . GT:GQ:DP:FLT 1/1:39:4:0 1 883625 rs4970378 A G . PASS .
1 887560 rs3748595 A C 99 0 . GT:GQ:DP:FLT 1/1:99:40:0 1 887560 rs3748595 A C . PASS .
1 887801 rs3828047 A G 99 0 . GT:GQ:DP:FLT 1/1:99:32:0 1 887801 rs3828047 A G . PASS .
1 888639 rs3748596 T C 99 0 . GT:GQ:DP:FLT 1/1:99:32:0 1 888639 rs3748596 T C . PASS .
1 888659 rs3748597 T C 99 0 . GT:GQ:DP:FLT 1/1:99:26:0 1 888659 rs3748597 T C . PASS .
(...)

mysqlquery

$ echo -e "#Gene\nuc001aaa.3\nHello\nuc001aac.3" |\
    mysqlquery --host localhost --user anonymous --port 3316 \
      -q 'select mRNA,description from kgXref where kgId="$1"' |\
    verticalize

>>>2$1#Gene uc001aaa.3$2mRNA BC032353$3descriptionHomo sapiens mRNA for DEAD/H box polypeptide 11 like 1 (DDX11L1 gene).>3$1#Gene Hello$2mRNA .$3description.>4$1#Gene uc001aac.3$2mRNA BC063459$3descriptionHomo sapiens cDNA FLJ31670 fis, clone NT2RI2004984.3$1#CHROM 1(...)

uniprot

$ echo -e "#POS\tID\n54\tQ04721\n166\tP03536" |\
    uniprot -p 1 -s 2 |\
    verticalize

>>>2$1#POS 54$2ID Q04721$3uniprot.beg 26$4uniprot.end 2471$5uniprot.type chain$6uniprot.status .$7uniprot.desc Neurogenic locus notch homolog protein 2$8uniprot.evidence.$9uniprot.ref .>3$1#POS 54$2ID Q04721$3uniprot.beg 26$4uniprot.end 1677$5uniprot.type topological domain$6uniprot.status potential$7uniprot.desc Extracellular$8uniprot.evidence.$9uniprot.ref .>2$1#Gene NOTCH2$2stringId 9606.ENSP00000256646$3preferredNameNOTCH2$4annotation Notch homolog 2 (Drosophila); Functions as a receptor...>3$1#Gene EIF4G1$2stringId 9606.ENSP00000316879$3preferredNameEIF4G1$4annotation eukaryotic translation initiation factor 4 gamma, 1; ...>4$1#Gene PABPC1$2stringId 9606.ENSP00000313007$3preferredNamePABPC1$4annotation poly(A) binding protein, cytoplasmic 1; Binds the...>2$1#Gene NOTCH2$2stringId 9606.ENSP00000256646$3preferredNameNOTCH2$4annotation Notch homolog 2 (Drosophila); Functions as a receptor for membr$5interactor 9606.ENSP00000256646$6stringId 9606.ENSP00000256646$7preferredNameNOTCH2$8annotation Notch homolog 2 (Drosophila); Functions as a receptor for membr>3$1#Gene NOTCH2$2stringId 9606.ENSP00000256646$3preferredNameNOTCH2$4annotation Notch homolog 2 (Drosophila); Functions as a receptor for membr$5interactor 9606.ENSP00000345206$6stringId 9606.ENSP00000345206$7preferredNameRBPJ$8annotation recombination signal binding protein for immunoglobulin kappa J>4$1#Gene NOTCH2$2stringId 9606.ENSP00000256646$3preferredNameNOTCH2$4annotation Notch homolog 2 (Drosophila); Functions as a receptor for membr$5interactor 9606.ENSP00000355718$6stringId 9606.ENSP00000355718$7preferredNameDLL1

emblstringinteractions

$ echo -e "#Gene\nNOTCH2\nEIF4G1\nPABPC1" |\
    emblstringresolve -c 1 |\
    emblstringinteractors -c 2 |\
    emblstringresolve -c 5 |\
    verticalize

>>>2
$1 #Gene NOTCH2
$2 stringId 9606.ENSP00000256646
$3 preferredName NOTCH2
$4 annotation Notch homolog 2 (Drosophila); Functions as a receptor for membr
$5 interactor 9606.ENSP00000256646
$6 stringId 9606.ENSP00000256646
$7 preferredName NOTCH2
$8 annotation Notch homolog 2 (Drosophila); Functions as a receptor for membr
>3
$1 #Gene NOTCH2
$2 stringId 9606.ENSP00000256646
$3 preferredName NOTCH2
$4 annotation Notch homolog 2 (Drosophila); Functions as a receptor for membr
$5 interactor 9606.ENSP00000345206
$6 stringId 9606.ENSP00000345206
$7 preferredName RBPJ
$8 annotation recombination signal binding protein for immunoglobulin kappa J
>4
$1 #Gene NOTCH2
$2 stringId 9606.ENSP00000256646
$3 preferredName NOTCH2
$4 annotation Notch homolog 2 (Drosophila); Functions as a receptor for membr
$5 interactor 9606.ENSP00000355718
$6 stringId 9606.ENSP00000355718
$7 preferredName DLL1

vcfcut

$ curl -s "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20100804/ALL.2of4intersection.20100804.sites.vcf.gz" |\
    gunzip -c | grep -v "##" |\
    vcfcut -e '2:10kb+500bp; 1:10000-20000'

#CHROM POS ID REF ALT QUAL FILTER INFO
1 10327 rs112750067 T C . PASS DP=65;AF=0.208;CB=BC,NCBI
1 10469 rs117577454 C G . PASS DP=2055;AF=0.020;CB=UM,BC,NCBI
1 10492 rs55998931 C T . PASS DP=231;AF=0.167;CB=BC,NCBI
(...)
1 16841 . G T . PASS DP=2906;AF=0.004;CB=UM,BI;EUR_R2=0.248
2 10038 . C A . PASS DP=73;AF=0.409;CB=BC,NCBI
2 10075 . C A . PASS DP=31;AF=0.150;CB=BC,NCBI
2 10144 . C A . PASS DP=33;AF=0.562;CB=BC,NCBI
2 10159 . C A . PASS DP=32;AF=0.222;CB=BC,NCBI
2 10205 . T G . PASS DP=582;AF=0.107;CB=UM,BC
2 10297 . G T . PASS DP=500;AF=0.246;CB=UM,BC
2 10363 . G A . PASS DP=788;AF=0.016;CB=UM,BI;EUR_R2=0...

ucscgenesps

$ cat sample2vcf.tsv | tr -d ' ' |\
    scanvcf |\
    awk -F ' ' '($3==".")' |\
    normalizechrom |\
    sort -t ' ' -k1,1 -k2,2n -k11,11 |\
    head -n 10000 |\
    ucscgenesps --host localhost --user username --port 3316 -s 11 \
      > result.ps

igvcontrol

vcfliftover

$ curl -s "http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp129.txt.gz" |\
    gunzip -c |\
    cut -d ' ' -f 2,3,5 |\
    vcfliftover -1 -f /path/to/hg18ToHg19.over.chain

chr1 433 rs56289060 chr1 10433 10434 .
chr1 491 rs55998931 chr1 10491 10492 .
chr1 518 rs62636508 chr1 10518 10519 .
chr1 582 rs58108140 chr1 10582 10583 .
chr1 690 rs10218492 chr1 10827 10828 .
chr1 766 rs10218493 chr1 10903 10904 .
chr1 789 rs10218527 chr1 10926 10927 .
chr1 800 rs28853987 chr1 10937 10938 .
chr1 876 rs28484712 chr1 11013 11014 .
chr1 884 rs28775022 chr1 11021 11022 .
(...)
chr1 1609710 rs61776794 . . . Deleted in new
chr1 1609743 rs61776795 . . . Deleted in new
chr1 1609758 rs61776796 . . . Deleted in new
chr1 1609849 rs7413891 . . . Deleted in new
chr1 1610719 rs3737622 . . . Deleted in new
chr1 1610719 rs45576038 . . . Deleted in new
chr1 1610763 rs3737624 . . . Deleted in new
chr1 2475133 rs3091278 . . . Deleted in new
chr1 2475134 rs3091239 . . . Deleted in new
(...)

backlocate

$ echo -e "NOTCH2\tM1T\nEIF4G1\tD240Y" |\
    backlocate -f /path/to/hg19.fa

#User.GeneAA1petide.pos.1AA2knownGene.nameknownGene.strandknownGene.AAindex0.in.rnacodonbase.in.rnachromosomeindex0.in.genomicexon##uc001eik.2NOTCH2M1Tuc001eik.2-M0ATGAchr1120612019Exon 1NOTCH2M1Tuc001eik.2-M1ATGTchr1120612018Exon 1NOTCH2M1Tuc001eik.2-M2ATGGchr1120612017Exon 1##uc001eil.2NOTCH2M1Tuc001eil.2-M0ATGAchr1120612019Exon 1NOTCH2M1Tuc001eil.2-M1ATGTchr1120612018Exon 1NOTCH2M1Tuc001eil.2-M2ATGGchr1120612017Exon 1##uc001eim.3NOTCH2M1Tuc001eim.3-M0ATGAchr1120548116Exon 2NOTCH2M1Tuc001eim.3-M1ATGTchr1120548115Exon 2NOTCH2M1Tuc001eim.3-M2ATGGchr1120548114Exon 2##Warning ref aminod acid for uc003fnp.2 [240] is not the same (I/D)EIF4G1D240Yuc003fnp.2+I717ATCAchr3184039089Exon 10EIF4G1D240Yuc003fnp.2+I718ATCTchr3184039090Exon 10EIF4G1D240Yuc003fnp.2+I719ATCCchr3184039091Exon 10##Warning ref aminod acid for uc003fnu.3 [240] is not the same (I/D)EIF4G1D240Yuc003fnu.3+I717ATCAchr3184039089Exon 9EIF4G1D240Yuc003fnu.3+I718ATCTchr3184039090Exon 9EIF4G1D240Yuc003fnu.3+I719ATCCchr3184039091Exon 9##Warning ref aminod acid for uc003fnq.2 [240] is not the same (V/D)EIF4G1D240Yuc003fnq.2+V717GTAGchr3184039350Exon 7EIF4G1D240Yuc003fnq.2+V718GTATchr3184039351Exon 7EIF4G1D240Yuc003fnq.2+V719GTAAchr3184039352Exon 7##Warning ref aminod acid for uc003fnr.2 [240] is not the same (L/D)EIF4G1D240Yuc003fnr.2+L717CTCCchr3184039581Exon 6EIF4G1D240Yuc003fnr.2+L718CTCTchr3184039582Exon 6EIF4G1D240Yuc003fnr.2+L719CTCCchr3184039583Exon 6##Warning ref aminod acid for uc003fny.3 [240] is not the same (T/D)EIF4G1D240Yuc003fny.3+T717ACCAchr3184039677Exon 3EIF4G1D240Yuc003fny.3+T718ACCCchr3184039678Exon 3EIF4G1D240Yuc003fny.3+T719ACCCchr3184039679Exon 3##uc010hxx.2EIF4G1D240Yuc010hxx.2+D717GATGchr3184038780Exon 10EIF4G1D240Yuc010hxx.2+D718GATAchr3184039069Exon 11EIF4G1D240Yuc010hxx.2+D719GATTchr3184039070Exon 11##Warning ref aminod acid for uc003fns.2 [240] is not the same (L/D)EIF4G1D240Yuc003fns.2+L717CTCCchr3184039209Exon 10EIF4G1D240Yuc003fns.2+L718CTCTchr3184039210Exon 
10EIF4G1D240Yuc003fns.2+L719CTCCchr3184039211Exon 10

Thank you

Klicken Sie, um das Format des Titeltextes zu bearbeiten

Klicken Sie, um die Formate des Gliederungstextes zu bearbeiten
Zweite Gliederungsebene
Dritte Gliederungsebene
Vierte Gliederungsebene
Fünfte Gliederungsebene
Sechste Gliederungsebene
Siebente Gliederungsebene
Achte Gliederungsebene
Neunte Gliederungsebene