27 September 2012

Playing with the #Ensembl REST API: filtering a VCF with javascript

The new Ensembl REST API was announced today: "We are pleased to announce the beta release of our programming language agnostic REST API, for Release 68 data, at http://beta.rest.ensembl. Our initial release provides access to:

  • Sequences (genomic, cDNA, CDS and protein)
  • VEP (Variant Effect Predictor)
  • Homologies
  • Gene Trees
  • Assembly and coordinate mapping."

In the current post I will filter a VCF file with this API and javascript.

The VCF

Our initial file is the following VCF:
##fileformat=VCFv4.0
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
#CHROM POS ID REF ALT QUAL FILTER INFO
1 10327 rs112750067 T C . PASS DP=65
1 69427 rs148502021 T A . PASS DP=1557
1 69452 rs142004627 A G . PASS DP=155
1 69475 rs148502021 C T . PASS DP=231
1 865583 rs148711625 A G . PASS DP=231
1 866460 rs148884928 A G . PASS DP=23
1 866461 . G A . PASS DP=24

The javascript

The VCF will be read on the standard input using the following script and the Rhino JS engine. The script reads the VCF and for each substitution, it calls the Ensembl REST API, parses the JSON response and return the transcript identifier if the mutation is a missense_variant or a polyphen_prediction="probably damaging":
importPackage(java.io);
importPackage(java.lang);



function sleep(milliseconds)
  {
  /* hacked from http://www.phpied.com/sleep-in-javascript/ */
  var start = new Date().getTime();
  for (var i = 0; i < 1e7; i++) {
    if ((new Date().getTime() - start) > milliseconds){
      break;
      }
    }
  }


function damagingTranscript(json)
 {
 for(var d in json.data)
  {
  var transcripts=json.data[d].transcripts;
 
  if(!transcripts) return null;
  for(var t in transcripts)
   {
   var transcript=transcripts[t];
   for(var a in transcript.alleles)
    {
    var allele=transcript.alleles[a];
    if(allele.polyphen_prediction=="probably damaging" ||
       allele.consequence_terms.indexOf("missense_variant")!=-1 )
        {
        return transcript.transcript_id;
        }
    }
   }
  }
 return null;
 }

var baseRegex=new RegExp(/^[ATGCatgc]$/);



var stdin = new java.io.BufferedReader( new java.io.InputStreamReader(java.lang.System['in']) );
var line;
while((line=stdin.readLine())!=null)
 {
 if(line.startsWith("#"))
  {
  print(line); continue;
  }
 var tokens=line.split("\t");
 var chrom=tokens[0];
 var pos= parseInt(tokens[1]);
 var ref= tokens[3];
 var alt= tokens[4];
 if(!baseRegex.test(ref)) continue;
 if(!baseRegex.test(alt)) continue;
 
 sleep(200);
 var url="http://beta.rest.ensembl.org/vep/human/"+
  chrom+":"+pos+"-"+pos+"/"+alt+
  "/consequences?content-type=application/json";
 var jsonStr=readUrl(url,"UTF-8");
 var json=eval("("+jsonStr+")");
 var transcript=damagingTranscript(json);
 if(transcript==null) continue;
 tokens[7]+=(";TRANSCRIPT="+transcript);
 print(tokens.join('\t'));
 }

As an example, here is the response from the ENSEMBL server for 1:866460 A/G:

http://beta.rest.ensembl.org/vep/human/1:866460-866460/G/consequences?content-type=application/json
{
    "data": [
        {
            "location": {
                "coord_system": "chromosome",
                "name": "1",
                "strand": 1,
                "end": "866460",
                "start": "866460"
            },
            "hgvs": {
                "G": "1:g.866460C>G"
            },
            "transcripts": [
                {
                    "translation_stable_id": "ENSP00000411579",
                    "intron_number": null,
                    "cdna_end": 385,
                    "translation_end": 99,
                    "exon_number": "4/7",
                    "is_canonical": 0,
                    "transcript_id": "ENST00000420190",
                    "cdna_start": 385,
                    "gene_id": "ENSG00000187634",
                    "cds_start": 296,
                    "translation_start": 99,
                    "alleles": {
                        "C/G": {
                            "sift_prediction": "deleterious",
                            "polyphen_prediction": "benign",
                            "polyphen_score": 0.001,
                            "display_codon_allele_string": "gCg/gGg",
                            "hgvs_protein": "ENSP00000411579.1:p.Ala99Gly",
                            "sift_score": 0.04,
                            "consequence_terms": [
                                "missense_variant"
                            ],
                            "pep_allele_string": "A/G",
                            "codon_allele_string": "GCG/GGG",
                            "hgvs_transcript": "ENST00000420190.1:c.296C>G"
                        }
                    },
                    "name": "SAMD11",
                    "biotype": "protein_coding",
                    "cds_end": 296,
                    "cdna_allele_string": "C/G",
                    "codon_position": 2
                },
                {
                    "translation_stable_id": "ENSP00000393181",
                    "intron_number": null,
                    "cdna_end": 356,
                    "translation_end": 99,
                    "exon_number": "4/5",
                    "is_canonical": 0,
                    "transcript_id": "ENST00000437963",
                    "cdna_start": 356,
                    "gene_id": "ENSG00000187634",
                    "cds_start": 296,
                    "translation_start": 99,
                    "alleles": {
                        "C/G": {
                            "sift_prediction": "deleterious",
                            "polyphen_prediction": "benign",
                            "polyphen_score": 0.001,
                            "display_codon_allele_string": "gCg/gGg",
                            "hgvs_protein": "ENSP00000393181.1:p.Ala99Gly",
                            "sift_score": 0.03,
                            "consequence_terms": [
                                "missense_variant"
                            ],
                            "pep_allele_string": "A/G",
                            "codon_allele_string": "GCG/GGG",
                            "hgvs_transcript": "ENST00000437963.1:c.296C>G"
                        }
                    },
                    "name": "SAMD11",
                    "biotype": "protein_coding",
                    "cds_end": 296,
                    "cdna_allele_string": "C/G",
                    "codon_position": 2
                },
                {
                    "translation_stable_id": "ENSP00000342313",
                    "intron_number": null,
                    "cdna_end": 379,
                    "translation_end": 99,
                    "exon_number": "4/14",
                    "is_canonical": 1,
                    "transcript_id": "ENST00000342066",
                    "cdna_start": 379,
                    "gene_id": "ENSG00000187634",
                    "cds_start": 296,
                    "alleles": {
                        "C/G": {
                            "sift_prediction": "deleterious",
                            "polyphen_prediction": "benign",
                            "polyphen_score": 0.001,
                            "display_codon_allele_string": "gCg/gGg",
                            "hgvs_protein": "ENSP00000342313.3:p.Ala99Gly",
                            "sift_score": 0.01,
                            "consequence_terms": [
                                "missense_variant"
                            ],
                            "pep_allele_string": "A/G",
                            "codon_allele_string": "GCG/GGG",
                            "hgvs_transcript": "ENST00000342066.3:c.296C>G"
                        }
                    },
                    "translation_start": 99,
                    "name": "SAMD11",
                    "biotype": "protein_coding",
                    "cds_end": 296,
                    "cdna_allele_string": "C/G",
                    "ccds": "CCDS2.2",
                    "codon_position": 2
                },
                {
                    "translation_stable_id": "ENSP00000349216",
                    "intron_number": null,
                    "cdna_end": 67,
                    "translation_end": 23,
                    "exon_number": "2/12",
                    "is_canonical": 0,
                    "transcript_id": "ENST00000341065",
                    "cdna_start": 67,
                    "gene_id": "ENSG00000187634",
                    "cds_start": 68,
                    "translation_start": 23,
                    "alleles": {
                        "C/G": {
                            "sift_prediction": "deleterious",
                            "polyphen_prediction": "benign",
                            "polyphen_score": 0.008,
                            "display_codon_allele_string": "gCg/gGg",
                            "hgvs_protein": "ENSP00000349216.4:p.Ala23Gly",
                            "sift_score": 0.01,
                            "consequence_terms": [
                                "missense_variant"
                            ],
                            "pep_allele_string": "A/G",
                            "codon_allele_string": "GCG/GGG",
                            "hgvs_transcript": "ENST00000341065.4:c.67C>G"
                        }
                    },
                    "name": "SAMD11",
                    "biotype": "protein_coding",
                    "cds_end": 68,
                    "cdna_allele_string": "C/G",
                    "codon_position": 2
                }
            ]
        }
    ]
}

Invoking RHINO, filtering the VCF

$ cat input.vcf | rhino -f restensembl.js
  
##fileformat=VCFv4.0
##INFO=
#CHROM POS ID REF ALT QUAL FILTER INFO
1 69427 rs148502021 T A . PASS DP=1557;TRANSCRIPT=ENST00000335137
1 69452 rs142004627 A G . PASS DP=155;TRANSCRIPT=ENST00000335137
1 69475 rs148502021 C T . PASS DP=231;TRANSCRIPT=ENST00000335137
1 865583 rs148711625 A G . PASS DP=231;TRANSCRIPT=ENST00000420190
1 866460 rs148884928 A G . PASS DP=23;TRANSCRIPT=ENST00000420190

Limitations for the Ensembl REST API:




That's it,

Pierre

18 September 2012

Describing protein-protein interactions in XML: customizing the xsd-schema with JAXB, my notebook.

Say, you want to describe a network of protein-protein interaction using a XML format. Your XML schema will contain a set of

  • Articles/References
  • Proteins
  • Interactions
Here is a simple XSD schema for this model:
<?xml version="1.0" encoding="UTF-8"?>
<schema xmlns="http://www.w3.org/2001/XMLSchema" xmlns:tns="http://www.example.org/" targetNamespace="http://www.example.org/" elementFormDefault="qualified">
  <complexType name="article">
    <sequence>
      <element name="title" type="string"/>
      <element name="year" type="gYear"/>
    </sequence>
    <attribute name="pmid" type="ID" use="required"/>
  </complexType>
  <complexType name="protein">
    <sequence>
      <element name="acn" type="ID"/>
      <element name="description" type="string"/>
    </sequence>
  </complexType>
  <complexType name="interaction">
    <sequence>
      <element name="pmids" type="int" minOccurs="1" maxOccurs="unbounded"/>
      <element name="proteins" type="IDREF" minOccurs="1" maxOccurs="unbounded"/>
    </sequence>
  </complexType>
  <complexType name="interactome">
    <sequence>
      <element name="article" type="tns:article" minOccurs="0" maxOccurs="unbounded"/>
      <element name="protein" type="tns:protein" minOccurs="0" maxOccurs="unbounded"/>
      <element name="interaction" type="tns:interaction" minOccurs="0" maxOccurs="unbounded"/>
    </sequence>
  </complexType>
  <element name="interactome" type="tns:interactome"/>
</schema>
Here, the attibutes 'type="ID"' and 'type="IDREF"' are used to link the entities (One protein can be part of several interactions,....).
One can generate the java classes for those types using: ${JAVA_HOME}/bin/xjc:
$ xjc  interactome.xsd
parsing a schema...
compiling a schema...
org/example/Article.java
org/example/Interaction.java
org/example/Interactome.java
org/example/ObjectFactory.java
org/example/Protein.java
org/example/package-info.java
Problem: xjc doesn't know the exact nature of the links created between ID and IDREF. What kind of object should return the method 'getProteins' of the class 'Interaction' ? In consequence, xjc generates the following code:

$ more org/example/Interaction.java

    (...)
    protected List<JAXBElement<Object>> proteins;
    (...)
    public List<JAXBElement<Object>> getProteins()

We can tell xjc about those link by creating a binding file (JXB). In the following file, we tell XJC that the entities linked by 'proteins' should be some instances of 'Protein':
<?xml version="1.0" encoding="UTF-8"?>
<jxb:bindings xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xjc="http://java.sun.com/xml/ns/jaxb/xjc" xmlns:jxb="http://java.sun.com/xml/ns/jaxb" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" jxb:version="2.1">
  <jxb:bindings schemaLocation="interactome.xsd">
    <jxb:bindings node="/xs:schema/xs:complexType[@name=' interaction']/xs:sequence">
      <jxb:bindings node="xs:element[@name=' proteins']">
        <jxb:property>
          <jxb:baseType name="Protein"/>
        </jxb:property>
      </jxb:bindings>
    </jxb:bindings>
  </jxb:bindings>
</jxb:bindings>

Invoking XJC with the bindings:
$ xjc -b interactome.jxb  interactome.xsd
parsing a schema...
compiling a schema...
org/example/Article.java
org/example/Interaction.java
org/example/Interactome.java
org/example/ObjectFactory.java
org/example/Protein.java
org/example/package-info.java

The generated class 'Interaction.java' now contains the correct java type:


$ more org/example/Interaction.java

   (...)
    protected List<Protein> proteins;
    (...)
    public List<Protein> getProteins() {
     (....)

That's it,
Pierre

Notes about bwa 0.6.2 and the multiple hits.

Here are my notes about the way bwa 0.6.2 handles the multiple hits.


I've created a sub-sequence (~6Mb ) of the human chr22 (see the Makefile below). The sequence doesn't contain any base 'N' or any lowercase (=repeatMasked) letter.
1E6 reads (forward and reverse) have been generated from this ~chr22 with samtools wgsim. No mutation and no sequencing error have been allowed.

This ~chr22 sequence have been duplicated and renamed as 'DUP'. Both sequences have been merged in one fasta file named 'twoChrom.fa'

The reads have been aligned with BWA (v. 0.6.2-r126) on this genome.


There's only one SAM alignment per read

$ samtools view align.sorted.bam  | wc -l
2000000

1% of the reads were not properly paired/mapped on the genome

$ samtools view  -f 2 align.sorted.bam | wc -l
1999926

The properly mapped reads contain some informations about the alternative hits in 'XA'

$samtools view  -f 2 align.sorted.bam | grep 22_10_402_0 | verticalize -n

>>> 1
$1   22_10_402_0:0:0_0:0:0_c1608
$2   99
$3   22
$4   10
$5   0
$6   70M
$7   =
$8   333
$9   393
$10  GGGACGGTCATGCAATCTGGACAACATTCACCTTTAAAAGTTTATTGATCTTTTGTGACATGCACGTGGG
$11  IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
$12  XT:A:R
$13  NM:i:0
$14  SM:i:0
$15  AM:i:0
$16  X0:i:2
$17  X1:i:0
$18  XM:i:0
$19  XO:i:0
$20  XG:i:0
$21  MD:Z:70
$22  XA:Z:DUP,+10,70M,0;
<<< 1

>>> 2
$1   22_10_402_0:0:0_0:0:0_c1608
$2   147
$3   22
$4   333
$5   0
$6   70M
$7   =
$8   10
$9   -393
$10  ATACCTGCCAGATGAGTCACTGGCAAAAGGTGCTGCTCCCTGGTGAGGGAGAAACACCAGGGGCTGGGAG
$11  IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
$12  XT:A:R
$13  NM:i:0
$14  SM:i:0
$15  AM:i:0
$16  X0:i:2
$17  X1:i:0
$18  XM:i:0
$19  XO:i:0
$20  XG:i:0
$21  MD:Z:70
$22  XA:Z:DUP,-333,70M,0;
<<< 2

50% mapped on chr22, 50% mapped on DUP

$ samtools view  -f 2 align.sorted.bam | cut -d '      ' -f 3 | sort | uniq -c
 999860 22
1000066 DUP

Some pairs have been unmapped because the mate have been mapped on the other chromosome !

$ samtools-0.1.18/samtools view  -F 2 align.sorted.bam | grep 22_1210321_1210924 | verticalize -n

>>> 1
$1   22_1210321_1210924_0:0:0_0:0:0_e28f5
$2   81
$3   22
$4   1210855
$5   0
$6   70M
$7   DUP
$8   1210321
$9   0
$10  AGAGACTGAGTCCGGCTAGAGAACAGGGTGGAGCCCCTTTGGACCTTAGAGCTGGGCCTTTGGGCCTTGG
$11  IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
$12  XT:A:R
$13  NM:i:0
$14  SM:i:0
$15  AM:i:0
$16  X0:i:6
$17  X1:i:0
$18  XM:i:0
$19  XO:i:0
$20  XG:i:0
$21  MD:Z:70
$22  XA:Z:22,-1996342,70M,0;DUP,-1996342,70M,0;22,+2551778,70M,0;DUP,+2551778,70M,0;DUP,-1210855,70M,0;
<<< 1

>>> 2
$1   22_1210321_1210924_0:0:0_0:0:0_e28f5
$2   161
$3   DUP
$4   1210321
$5   0
$6   70M
$7   22
$8   1210855
$9   0
$10  CCCGGGCCGGATGGCTCGCCTGCGCGGCCAGCTCCGGGCCGAAGCGGCTTCGCGGTCCGAGGTGCCGCGG
$11  IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
$12  XT:A:R
$13  NM:i:0
$14  SM:i:0
$15  AM:i:0
$16  X0:i:2
$17  X1:i:0
$18  XM:i:0
$19  XO:i:0
$20  XG:i:0
$21  MD:Z:70
$22  XA:Z:22,+1210321,70M,0;
<<< 2

The Makefile

SAMDIR=/usr/local/package/samtools-0.1.18
SAMTOOLS=$(SAMDIR)/samtools
BCFTOOLS=$(SAMDIR)/bcftools/bcftools
BWA=/usr/local/package/bwa-0.6.2/bwa
REF1=chr22.fa
REF2=twoChrom.fa

.INTERMEDIATE : align.sam random_1.sai random_2.sai align.bam variations.bcf

%.bam : %.sam
        $(SAMTOOLS) view -o $@ -b -S -T $(REF2) $<
%.bam.bai : %.bam
        $(SAMTOOLS) index $<



align.sorted.bam : align.bam
        $(SAMTOOLS) sort $< align.sorted


align.sam : random_1.sai random_2.sai  
        $(BWA) sampe -a 600 $(REF2) $^ random_1.fq.gz random_2.fq.gz > $@

$(REF1):
        curl -s "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/chromosomes/$(REF1).gz" |\
        gunzip -c | sed 's/^>chr/>/' | grep -v '[Nnatgc]' | head -n 100000 > $@


random_1.sai :  random_1.fq.gz $(REF2).bwt
        $(BWA) aln -f $@ $(REF2) $<

random_2.sai :  random_2.fq.gz $(REF2).bwt
        $(BWA) aln -f $@ $(REF2) $<

random_1.fq.gz random_2.fq.gz : $(REF1)
        $(SAMDIR)/misc/wgsim -N 1000000 -e 0.0 -r 0.0 -d 400 $< random_1.fq random_2.fq > wgsim.output
        gzip -f --best random_1.fq random_2.fq

$(REF2): $(REF1)
        cp $< $@
        sed 's/^>.*/>DUP/' $< >> $@

$(REF2).bwt : $(REF2)
        $(BWA) index -a bwtsw $<

$(REF2).fai :  $(REF2)
        $(SAMTOOLS) faidx $< 



clean:
        rm -f chr22.* *.bam *.vcf *.bcf *.sai *.gz *.fq *.bai  wgsim.output *.sam
That's it,

Pierre

13 September 2012

Translating a DNA sequence in Google Spreadsheet using #GoogleAppScript

Google has released Google App Script.
"Google Apps Script is a JavaScript cloud scripting language that lets you extend Google Apps and build web applications. Scripts are developed in Google Apps Script’s browser-based script editor, and they are stored in and run from Google's servers.
Google Apps Script is very versatile. Here are some examples of things you can do with Google Apps Script":
  • Build custom functions in a Google Spreadsheet
  • Extend certain Google Apps products by creating custom menus linked to scripts
  • Create and publish web applications, which can run on their own or embedded within a Google Site
  • Schedule tasks like report creation and distribution and run them on a custom schedule
  • Automate workflows such as document or expense approvals, order fulfillment, time-tracking, and more

In the current post, I'll show how to create a custom javascript function that will

Translate a
DNA
to a
Protein
into a
Google Spreadsheet

Create a new Google Spreadhseet. Open the menu "Tools" > "Script Manager...". Click on New...

Click on "Create Blank Project".

A new editor is opened. Copy the following javascript code (https://gist.github.com/3716137) into the editor. Save the javascript projet.

Close the script, go back to the spreasheet. You can now use your new function =translateDNA(dna):

That's it,

Pierre

05 September 2012

Customizing the java classes for the NCBI generated by XJC

Reminder: XJC is the Java XML Binding Compiler. It automates the mapping between XML documents and Java objects:

XSD (aka XML-schema)
+
XJC
=
JAVA Classes

The code generated by XJC allows to :
  • Unmarshal XML content into a Java representation
  • Access and update the Java representation
  • Marshal the Java representation of the XML content into XML content

For example, the following XML-Schema (tinyseq.xsd) describes a TinySeq-XML document returned by the NCBI.

<?xml version="1.0" encoding="UTF-8"?>
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
  <xs:annotation>
    <xs:documentation> XML schema for NCBI tinyseq format</xs:documentation>
  </xs:annotation>
  <xs:complexType name="TSeqSet_t">
    <xs:annotation>
      <xs:documentation>Set of sequences</xs:documentation>
    </xs:annotation>
    <xs:sequence>
      <xs:element ref="TSeq" maxOccurs="unbounded"/>
    </xs:sequence>
  </xs:complexType>
  <xs:complexType name="TSeq_t">
    <xs:annotation>
      <xs:documentation>A Tiny Sequence</xs:documentation>
    </xs:annotation>
    <xs:sequence>
      <xs:element name="TSeq_seqtype">
        <xs:complexType>
          <xs:attribute name="value">
            <xs:simpleType>
              <xs:restriction base="xs:string">
                <xs:enumeration value="nucleotide"/>
                <xs:enumeration value="protein"/>
              </xs:restriction>
            </xs:simpleType>
          </xs:attribute>
        </xs:complexType>
      </xs:element>
      <xs:element name="TSeq_gi" type="xs:long"/>
      <xs:element name="TSeq_accver" type="xs:string"/>
      <xs:element name="TSeq_sid" type="xs:string"/>
      <xs:element name="TSeq_taxid" type="xs:long"/>
      <xs:element name="TSeq_orgname" type="xs:string"/>
      <xs:element name="TSeq_defline" type="xs:string"/>
      <xs:element name="TSeq_length" type="xs:nonNegativeInteger"/>
      <xs:element name="TSeq_sequence" type="xs:string"/>
    </xs:sequence>
  </xs:complexType>
  <xs:element name="TSeqSet" type="TSeqSet_t"/>
  <xs:element name="TSeq" type="TSeq_t"/>
</xs:schema>

This xml-schema can be compiled with XJC:

${JAVA_HOME}/bin/xjc -d . -p generated tinyseq.xsd
parsing a schema...
compiling a schema...
generated/ObjectFactory.java
generated/TSeqSetT.java
generated/TSeqT.java

$ more generated/TSeqT.java

package generated;
(...)
@XmlAccessorType(XmlAccessType.FIELD)
@XmlType(name = "TSeq_t", propOrder = {  "tSeqSeqtype",  "tSeqGi",  "tSeqAccver","tSeqSid",(...),"tSeqSequence"})
public class TSeqT {
    @XmlElement(name = "TSeq_seqtype", required = true)
    protected TSeqT.TSeqSeqtype tSeqSeqtype;
    @XmlElement(name = "TSeq_gi")
    protected long tSeqGi;
    @XmlElement(name = "TSeq_accver", required = true)
    protected String tSeqAccver;
    @XmlElement(name = "TSeq_sid", required = true)
    protected String tSeqSid;
    @XmlElement(name = "TSeq_taxid")
    protected long tSeqTaxid;
    @XmlElement(name = "TSeq_orgname", required = true)
    protected String tSeqOrgname;
    @XmlElement(name = "TSeq_defline", required = true)
    protected String tSeqDefline;
    @XmlElement(name = "TSeq_length", required = true)
    @XmlSchemaType(name = "nonNegativeInteger")
    protected BigInteger tSeqLength;
    @XmlElement(name = "TSeq_sequence", required = true)
    protected String tSeqSequence;
(...)
    }

But XJC doesn't know how to generate some classical java functions like 'hashCode', 'equals' or 'toString' or to add some custom methods to your classes.

Hopefully the standard distribution of XJC comes with a plugin named -Xinject-code whch injects some custom code in the classes generated by XJC.

XSD (aka XML-schema)
+
java xml binding (jxb)
+
XJC
=
Customized JAVA Classes

For example, if we want to add a toString method to the class TSeqT, we're going to write the following "java xml binding file" (jxb) which alters the initial xml schema:

<?xml version="1.0" encoding="UTF-8"?>
<jxb:bindings 
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:xjc="http://java.sun.com/xml/ns/jaxb/xjc"
xmlns:jxb="http://java.sun.com/xml/ns/jaxb"
xmlns:ci="http://jaxb.dev.java.net/plugin/code-injector"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
jxb:extensionBindingPrefixes="ci "
jxb:version="2.1"

>

<jxb:bindings schemaLocation="tinyseq.xsd">
        <!-- here we use an XPATH expression to tell xjc about which part
 of the XML schema we want to change -->
 <jxb:bindings node="/xs:schema/xs:complexType[@name='TSeq_t']">
  <ci:code>

 /** toString : returns the gi and the defline  */
 public String toString()
  {
  return "gi:"+getTSeqGi()+"|"+getTSeqDefline();
  }

</ci:code>
 </jxb:bindings>
</jxb:bindings>

</jxb:bindings>

Below, I wrote a larger JXB file 'tinyseq.jxb' which injects the following methods:
  • 'equals' method for TSeq
  • 'hashCode' method for TSeq
  • 'toString' method for TSeq
  • 'printAsFasta' method for TSeq
  • 'getTSeqSetbyId' method for TSeqSet. A static function fetching a TinySeq sequence from the NCBI for a given 'gi'
  • a 'main' method for TSeqSet. It loops over a list of 'gi's, fetches the sequences (using NCBI-EFetch) and prints the sequences as FASTA
<?xml version="1.0" encoding="UTF-8"?>
<jxb:bindings 
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:xjc="http://java.sun.com/xml/ns/jaxb/xjc"
xmlns:jxb="http://java.sun.com/xml/ns/jaxb"
xmlns:ci="http://jaxb.dev.java.net/plugin/code-injector"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
jxb:extensionBindingPrefixes="ci "
jxb:version="2.1"

>

<jxb:bindings schemaLocation="tinyseq.xsd">
 <jxb:bindings node="/xs:schema/xs:complexType[@name='TSeq_t']">
  <ci:code>
 /* print this sequence as fasta */
 public void printAsFasta(java.io.PrintStream out)
  {
  String s=getTSeqSequence();
  out.print(">gi:"+getTSeqGi()+"|"+ getTSeqAccver() +"|"+getTSeqDefline());
  for(int i=0;i &lt; s.length();++i)
   {
   if(i%60==0) out.println();
   out.print(s.charAt(i));
   }
  out.println();
  }

 /** equals: two  TSeq are equal if they have the same gi */
 @Override
 public boolean equals(Object o)
  {
  if(o==this) return true;
  if(o==null || o.getClass()!=this.getClass()) return false;
  return this.getTSeqGi()==TSeqT.class.cast(o).getTSeqGi();
  }

 /** hashCode : use gi */
 @Override
 public int hashCode()
  {
  return  (int)(this.getTSeqGi()^(this.getTSeqGi()>>>32));
  }

 /** toString : returns the gi and the defline  */
 public String toString()
  {
  return "gi:"+getTSeqGi()+"|"+getTSeqDefline();
  }

</ci:code>
 </jxb:bindings>




 <jxb:bindings node="/xs:schema/xs:complexType[@name='TSeqSet_t']">
  <ci:code>
 /** get TSeqSetT from a given gi */
 public static TSeqSetT getTSeqSetbyId(long gi)
  throws javax.xml.bind.JAXBException, javax.xml.bind.UnmarshalException , java.io.IOException
  {
  /** find the JAXB context in the defined path */
  javax.xml.bind.JAXBContext jc = javax.xml.bind.JAXBContext.newInstance(TSeqSetT.class,TSeqT.class);
  javax.xml.bind.Unmarshaller u = jc.createUnmarshaller();
  String uri="http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&amp;rettype=fasta&amp;retmode=xml&amp;id="+gi;
  /** read the sequence */
  return u.unmarshal(new javax.xml.transform.stream.StreamSource(uri),TSeqSetT.class).getValue();
  }

 /** main: takes a list of gi and prints the sequences as fasta */
 public static void main(String args[]) throws Exception
  {
  for(int optind=0;optind &lt; args.length;++optind)
   {
   TSeqSetT tss=TSeqSetT.getTSeqSetbyId(Long.parseLong(args[optind]));
   for(generated.TSeqT seq:tss.getTSeq()) seq.printAsFasta(System.out);
   }
  }

</ci:code>

</jxb:bindings>

</jxb:bindings>

</jxb:bindings>

Compile the schema, compile the java classes and execute

$ xjc -target 2.1 -verbose -Xinject-code -extension -d . -p generated -b tinyseq.jxb tinyseq.xsd
parsing a schema...
compiling a schema...
[INFO] generating code
unknown location

generated/ObjectFactory.java
generated/TSeqSetT.java
generated/TSeqT.java

$ javac generated/*.java
$ java generated.TSeqSetT 25 26 27
>gi:25|X53813.1|Blue Whale heavy satellite DNA
TAGTTATTCAACCTATCCCACTCTCTAGATACCCCTTAGCACGTAAAGGAATATTATTTG
GGGGTCCAGCCATGGAGAATAGTTTAGACACTAGGATGAGATAAGGAACACACCCATTCT
AAAGAAATCACATTAGGATTCTCTTTTTAAGCTGTTCCTTAAAACACTAGAGTCTTAGAA
ATCTATTGGAGGCAGAAGCAGTCAAGGGTAGCCTAGGGTTAGGGTTAGGCTTAGGGTTAG
GGTTAGGGTACGGCTTAGGGTACTGTTTCGGGGAGGGGTTCAGGTACGGCGTAGGGTATG
GGTTAGGGTTAGGGTTAGGGTTAGTGTTAGGGTTAGGGCTCGGTTTAGGGTACGGGTTAG
GATTAGGGTACGTGTTAGGGTTAGGGTAGGGCTTAGGGTTAGGGTACGTGTTAGGGTTAG
GG
>gi:26|X53814.1|Blue Whale heavy satellite DNA
TAGTTATTAAACCTATCCCACTCTCTAGATACACCTTAGCACGTAAAGGAATATTATTTG
GGGGTCCAGACATGGAGAAGAGTTTAGACACTAGGATAAGATAAGGAACACACCCATTCT
AAAGAAATCACATTAGGATTCTCTTTTTAAGCTGTTCCTTAAAACTCTAGTGCTTAGGAA
ATCTATTGGAGGCAGAAGCAGTCAAGGGTAGCCTAGGGTTAGGGTTAGGCTTATGGTTAG
GGCTAGGGTACGGCTTAGGGTACGGATTCGGGGAGGGGTTCGGGTACGGCGTAGGGTATG
GGTTAGGGTTAGCGTTAGTGTTAGGGTTAGGGCTCGGTTTAGGGTACGGGTTAGGATTAG
GGTACGTGTTAGGGTTAGGGTAGGGGTTAGGGTTAGGGTACGCGTTAGGGTTAGGG
>gi:27|Z18633.1|B.physalus gene for large subunit rRNA
AACCAGTATTAGAGCACTGCCTGCCCGGTGACTAATCGTTAAACGGCCGCGGTATCCTGA
CCGTGCAAAGGTAGCATAATCACTTGTTCTCTAATTAGGGACTTGTATGAATGGCCACAC
GAGGGTTTTACTGTCTCTTACTTTTAATCAGTGAAATTGACCTCTCCGTGAAGAGGCGGA
GATAACAAAATAAGACGAGAAGACCCTATGGAGCTTCAATTAATCAACCCAAAAACCATA
ACCTTAAACCACCAAGGGATAACAAAACCTTATATGGGCTGACAATTTCGGTTGGGGTGA
CCTCGGAGTACAAAAAACCCTCCGAGTGATTAAAACTTAGGCCCACTAGCCAAAGTACAA
TATCACTTATTGATCCAATCCTTTGATCAACGGAACAAGTTACCCTAGGGATAACAGCGC
AATCCTATTCTAGAGTCCATATCGACAATAGGGTTTACGACCTCGATGTTGGATCAGGAC
ATCCTAATGGTGCAGCTGCTATTAAGGGTTCGTTTGTT
That's it,


Pierre