20 November 2011

Processing json data with apache velocity.

I've written a tool named "velocityjson" which parses JSON data and processes it with "Apache Velocity" (a template engine). The (JavaCC) source code is available here:


https://github.com/lindenb/jsandbox/blob/master/src/sandbox/VelocityJson.jj

Example

Say you have defined some classes using JSON:

[
  {
    "type": "record",
    "name": "Exon",
    "fields" : [
      {"name": "start", "type": "int"},
      {"name": "end", "type": "int"}
    ]
  },
  {
    "type": "record",
    "name": "Gene",
    "fields" : [
      {"name": "chrom", "type": "string"},
      {"name": "name", "type": "string"},
      {"name": "txStart", "type": "int"},
      {"name": "txEnd", "type": "int"},
      {"name": "cdsStart", "type": "int"},
      {"name": "cdsEnd", "type": "int"},
      {"name": "exons", "type":{"type":"array","items":"Exon"}}
    ]
  } 
 ]
and here is a Velocity template transforming this JSON structure to Java:

#macro(javaName $s)$s.substring(0,1).toUpperCase()$s.substring(1)#end
#macro(setter $s)set#javaName($s)#end
#macro(getter $s)get#javaName($s)#end
#macro(javaType $f)
#if($f.type.equals("string"))
java.lang.String#elseif($f.type.equals("boolean"))
boolean#elseif($f.type.equals("long"))
long#elseif($f.type.equals("float"))
float#elseif($f.type.equals("double"))
double#elseif($f.type.equals("int"))
int#elseif($f.items)
$f.items#elseif($f.type.type.equals("array"))
java.util.List<#javaType($f.type)>#else
$f.type
#end
#end

#foreach( $class in $avro)

class $class.name
{
#foreach( $field in $class.fields )
private  #javaType($field) $field.name;
#end

public ${class.name}()
 {
 }

public ${class.name}(#foreach( $field in $class.fields )
 #if($velocityCount>1),#end#javaType($field) $field.name
 #end
 )
 {
 #foreach( $field in $class.fields )
 this.$field.name=$field.name;
 #end
 }
 


#foreach( $field in $class.fields )
public void #setter($field.name)(#javaType($field) $field.name)
 {
 this.$field.name=$field.name;
 }
public #javaType($field) #getter($field.name)()
 {
 return this.$field.name;
 }
#end
}
#end
The json file can be processed with velocity using the following command line:

$ java -jar velocityjson.jar -f avro structure.json json2java.vm

Result

class Exon
{
private  int start;
private  int end;

public Exon()
 {
 }

public Exon( int start
  ,int end
  )
 {
  this.start=start;
  this.end=end;
  }
 


public void setStart(int start)
 {
 this.start=start;
 }
public int getStart()
 {
 return this.start;
 }
public void setEnd(int end)
 {
 this.end=end;
 }
public int getEnd()
 {
 return this.end;
 }
}

class Gene
{
private  java.lang.String chrom;
private  java.lang.String name;
private  int txStart;
private  int txEnd;
private  int cdsStart;
private  int cdsEnd;
private  java.util.List<Exon> exons;

public Gene()
 {
 }

public Gene( java.lang.String chrom
  ,java.lang.String name
  ,int txStart
  ,int txEnd
  ,int cdsStart
  ,int cdsEnd
  ,java.util.List<Exon> exons
  )
 {
  this.chrom=chrom;
  this.name=name;
  this.txStart=txStart;
  this.txEnd=txEnd;
  this.cdsStart=cdsStart;
  this.cdsEnd=cdsEnd;
  this.exons=exons;
  }
 


public void setChrom(java.lang.String chrom)
 {
 this.chrom=chrom;
 }
public java.lang.String getChrom()
 {
 return this.chrom;
 }
public void setName(java.lang.String name)
 {
 this.name=name;
 }
public java.lang.String getName()
 {
 return this.name;
 }
public void setTxStart(int txStart)
 {
 this.txStart=txStart;
 }
public int getTxStart()
 {
 return this.txStart;
 }
public void setTxEnd(int txEnd)
 {
 this.txEnd=txEnd;
 }
public int getTxEnd()
 {
 return this.txEnd;
 }
public void setCdsStart(int cdsStart)
 {
 this.cdsStart=cdsStart;
 }
public int getCdsStart()
 {
 return this.cdsStart;
 }
public void setCdsEnd(int cdsEnd)
 {
 this.cdsEnd=cdsEnd;
 }
public int getCdsEnd()
 {
 return this.cdsEnd;
 }
public void setExons(java.util.List<Exon> exons)
 {
 this.exons=exons;
 }
public java.util.List<Exon> getExons()
 {
 return this.exons;
 }
}


That's it,

Pierre

16 November 2011

"VCF annotation" with the NHLBI GO Exome Sequencing Project (JAX-WS)

The NHLBI Exome Sequencing Project (ESP) has released a web service to query their data. "The goal of the NHLBI GO Exome Sequencing Project (ESP) is to discover novel genes and mechanisms contributing to heart, lung and blood disorders by pioneering the application of next-generation sequencing of the protein coding regions of the human genome across diverse, richly-phenotyped populations and to share these datasets and findings with the scientific community to extend and enrich the diagnosis, management and treatment of heart, lung and blood disorders.".
In the current post, I'll show how I've used this web service to annotate a VCF file with this information.
The web service provided by the ESP is based on the SOAP protocol.
Here is an example of the XML response: We can generate the java classes for a client invoking this Web Service by using ${JAVA_HOME}/bin/wsimport.

$ wsimport -keep "http://evs.gs.washington.edu/wsEVS/EVSDataQueryService?wsdl"

parsing WSDL...
generating code...
compiling code...

Here is the Java code running this client. It scans the VCF, calls the web service for each variation and inserts the annotation as JSON in a new column.
... and the makefile:

Result (some columns have been cut)

curl -s "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/supporting/EUR.2of4intersection_allele_freq.20100804.sites.vcf.gz" |\
 gunzip -c |\
 java -jar evsclient.jar 



##fileformat=VCFv4.0
##filedat=20101112
##datarelease=20100804
##samples=629
##description="Where BI calls are present, genotypes and alleles are from BI.  In there absence, UM genotypes are used.  If neither are available, no genotype information is present and the alleles are from the NCBI calls."
(...)
#CHROM POS ID EVS
1 10469 rs117577454 {"start":10469,"chromosome":"1","stop":10470,"strand":"+","snpList":[],"setOfSiteCoverageInfo":[]}
1 10583 rs58108140 {"start":10583,"chromosome":"1","stop":10584,"strand":"+","snpList":[],"setOfSiteCoverageInfo":[]}
1 11508 . {"start":11508,"chromosome":"1","stop":11509,"strand":"
(...)
1 69511 . {"start":69511,"chromosome":"1","stop":69512,"strand":"+","snpList":[{"chromosome":"1","conservationScore":"1.0","conservationScoreGERP":"0.5","refAllele":"A","ancestralAllele":"G","filters":"PASS","clinicalLink":"unknown","positionString":"1:69511","chrPosition":69511,"alleles":"G/A","uaAlleleCounts":"1373/47","aaAlleleCounts":"880/600","totalAlleleCounts":"2253/647","uaAlleleAndCount":"G=1373/A=47","aaAlleleAndCount":"G=880/A=600","totalAlleleAndCount":"G=2253/A=647","uaMAF":3.3099,"aaMAF":40.5405,"totalMAF":22.3103,"avgSampleReadDepth":185,"geneList":"OR4F5","snpFunction":{"chromosome":"1","position":69511,"conservationScore":"1.0","conservationScoreGERP":"0.5","snpFxnList":[{"mrnaAccession":"NM_001005484","fxnClassGVS":"missense","aminoAcids":"THR,ALA","proteinPos":"141/306","cdnaPos":421,"pphPrediction":"benign","granthamScore":"58"}],"refAllele":"A","ancestralAllele":"G","firstRsId":75062661,"secondRsId":0,"filters":"PASS","clinicalLink":"unknown"},"altAlleles":"G","hasAtLeastOneAccession":"true","rsIds":"rs75062661"}],"setOfSiteCoverageInfo":[{"chromosome":"1","position":69511,"avgSampleReadDepth":185.0,"totalSamplesCovered":1452,"eaSamplesCovered":712,"avgEaSampleReadDepth":157.0,"aaSamplesCovered":740,"avgAaSampleReadDepth":211.0},{"chromosome":"1","position":69512,"avgSampleReadDepth":180.0,"totalSamplesCovered":1501,"eaSamplesCovered":739,"avgEaSampleReadDepth":153.0,"aaSamplesCovered":762,"avgAaSampleReadDepth":207.0}]}
(...)
1 901923 . {"start":901923,"chromosome":"1","stop":901924,"strand":"+","snpList":[{"chromosome":"1","conservationScore":"1.0","conservationScoreGERP":"5.0","refAllele":"C","ancestralAllele":"C","filters":"PASS","clinicalLink":"unknown","positionString":"1:901923","chrPosition":901923,"alleles":"A/C","uaAlleleCounts":"2/2542","aaAlleleCounts":"52/1934","totalAlleleCounts":"54/4476","uaAlleleAndCount":"A=2/C=2542","aaAlleleAndCount":"A=52/C=1934","totalAlleleAndCount":"A=54/C=4476","uaMAF":0.0786,"aaMAF":2.6183,"totalMAF":1.1921,"avgSampleReadDepth":35,"geneList":"PLEKHN1","snpFunction":{"chromosome":"1","position":901923,"conservationScore":"1.0","conservationScoreGERP":"5.0","snpFxnList":[{"mrnaAccession":"NM_032129","fxnClassGVS":"missense","aminoAcids":"SER,ARG","proteinPos":"4/612","cdnaPos":12,"pphPrediction":"probably-damaging","granthamScore":"110"}],"refAllele":"C","ancestralAllele":"C","firstRsId":0,"secondRsId":0,"filters":"PASS","clinicalLink":"unknown"},"altAlleles":"A","hasAtLeastOneAccession":"true","rsIds":"none"}],"setOfSiteCoverageInfo":[{"chromosome":"1","position":901923,"avgSampleReadDepth":35.0,"totalSamplesCovered":2280,"eaSamplesCovered":1272,"avgEaSampleReadDepth":32.0,"aaSamplesCovered":1008,"avgAaSampleReadDepth":38.0},{"chromosome":"1","position":901924,"avgSampleReadDepth":35.0,"totalSamplesCovered":2283,"eaSamplesCovered":1273,"avgEaSampleReadDepth":32.0,"aaSamplesCovered":1010,"avgAaSampleReadDepth":38.0}]}
1 902069 rs116147894 {"start":902069,"chromosome":"1","stop":902070,"strand":"+","snpList":[{"chromosome":"1","conservationScore":"0.0","conservationScoreGERP":"1.0","refAllele":"T","ancestralAllele":"T","filters":"PASS","clinicalLink":"unknown","positionString":"1:902069","chrPosition":902069,"alleles":"C/T","uaAlleleCounts":"2/320","aaAlleleCounts":"18/212","totalAlleleCounts":"20/532","uaAlleleAndCount":"C=2/T=320","aaAlleleAndCount":"C=18/T=212","totalAlleleAndCount":"C=20/T=532","uaMAF":0.6211,"aaMAF":7.8261,"totalMAF":3.6232,"avgSampleReadDepth":13,"geneList":"PLEKHN1","snpFunction":{"chromosome":"1","position":902069,"conservationScore":"0.0","conservationScoreGERP":"1.0","snpFxnList":[{"mrnaAccession":"NM_032129","fxnClassGVS":"intron","aminoAcids":"none","proteinPos":"NA","cdnaPos":-1,"pphPrediction":"unknown","granthamScore":"NA"}],"refAllele":"T","ancestralAllele":"T","firstRsId":0,"secondRsId":0,"filters":"PASS","clinicalLink":"unknown"},"altAlleles":"C","hasAtLeastOneAccession":"true","rsIds":"none"}],"setOfSiteCoverageInfo":[{"chromosome":"1","position":902069,"avgSampleReadDepth":13.0,"totalSamplesCovered":304,"eaSamplesCovered":169,"avgEaSampleReadDepth":13.0,"aaSamplesCovered":135,"avgAaSampleReadDepth":12.0},{"chromosome":"1","position":902070,"avgSampleReadDepth":12.0,"totalSamplesCovered":338,"eaSamplesCovered":190,"avgEaSampleReadDepth":13.0,"aaSamplesCovered":148,"avgAaSampleReadDepth":12.0}]}
1 902108 rs62639981 {"start":902108,"chromosome":"1","stop":902109,"strand":"+","snpList":[{"chromosome":"1","conservationScore":"0.0","conservationScoreGERP":"-8.7","refAllele":"C","ancestralAllele":"unknown","filters":"PASS","clinicalLink":"unknown","positionString":"1:902108","chrPosition":902108,"alleles":"T/C","uaAlleleCounts":"5/333","aaAlleleCounts":"0/248","totalAlleleCounts":"5/581","uaAlleleAndCount":"T=5/C=333","aaAlleleAndCount":"T=0/C=248","totalAlleleAndCount":"T=5/C=581","uaMAF":1.4793,"aaMAF":0.0,"totalMAF":0.8532,"avgSampleReadDepth":13,"geneList":"PLEKHN1","snpFunction":{"chromosome":"1","position":902108,"conservationScore":"0.0","conservationScoreGERP":"-8.7","snpFxnList":[{"mrnaAccession":"NM_032129","fxnClassGVS":"coding-synonymous","aminoAcids":"none","proteinPos":"36/612","cdnaPos":108,"pphPrediction":"unknown","granthamScore":"NA"}],"refAllele":"C","ancestralAllele":"unknown","firstRsId":62639981,"secondRsId":0,"filters":"PASS","clinicalLink":"unknown"},"altAlleles":"T","hasAtLeastOneAccession":"true","rsIds":"rs62639981"}],"setOfSiteCoverageInfo":[{"chromosome":"1","position":902108,"avgSampleReadDepth":13.0,"totalSamplesCovered":294,"eaSamplesCovered":170,"avgEaSampleReadDepth":13.0,"aaSamplesCovered":124,"avgAaSampleReadDepth":13.0},{"chromosome":"1","position":902109,"avgSampleReadDepth":13.0,"totalSamplesCovered":309,"eaSamplesCovered":177,"avgEaSampleReadDepth":13.0,"aaSamplesCovered":132,"avgAaSampleReadDepth":13.0}]}
(...)
That's it
Pierre

01 November 2011

The paper about BioStar has been published in "PLoS Computational Biology"

The article describing BioStar has been published in PLoS Computational Biology:

BioStar: An Online Question & Answer Resource for the Bioinformatics Community


Laurence D. Parnell, Pierre Lindenbaum, Khader Shameer, Giovanni Marco Dall'Olio, Daniel C. Swan, Lars Juhl Jensen, Simon J. Cockell, Brent S. Pedersen, Mary E. Mangan, Christopher A. Miller, Istvan Albert. 2011
PLoS Comput Biol 7(10)
http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1002216
Giovanni has already blogged about this paper here, and on my side, I've collected some tweets about this paper.

Many thanks to all the Biostar users and to the contributors of this paper.

That's it
Pierre