13 December 2006

JAVA 1.6 Mustang , Scripting and Bioinformatics.

Java 1.6 has been released and it is now open source. Among the new features of java, there is an embedded sql engine and a scripting engine. I've to tested this later one to see if I could create a simple filter for fasta sequences just like awk do with regular files. What is interesting is that you can call some (complex) methods already coded in java (see below with the trivial static functions 'reverseComplement' and 'gcPercent'.

The source code using the new Scripting API is at the bottom of this post.

It was compiled like this:

javac org/lindenb/sandox/FastaAWK.java

and here is a test: the program takes as input somes fasta sequences and it only prints the one where (the length is greater than 900 pb or the GC percent is lower than 0.45 or (the reverse complement contains ATGCTTCTTG and the name contains Xenopus)).

cat ~/roxan.fasta | java org/lindenb/sandox/FastaAWK "FastaAWK.gcPercent(sequence)< 0.45 || sequence.length>90 ||  (FastaAWK .reverseComplement(sequence).toUpperCase().indexOf('ATGCTTCTTG')!=-1 && name.indexOf('Xenopus')!=-1 )"


>gi|27592135|gb|CB017399.1|CB017399 pgn1c.pk016.a18 Chicken lymphoid cDNA librar
y (pgn1c) Gallus gallus cDNA clone pgn1c.pk016.a18 5' similar to ref|XP_176823.1
similar to Rotavirus X associated non-structural protein (RoXaN) [Mus musculus]
ref|XP_193795.1| similar to Rotavirus X associated non-structural protein (RoXa
N) [Mus musculus], mRNA sequence
GGAAGGGCTGCCCCACCATTCATCCTTTTCTCGTAGTTTGTGCACGGTGC
GGGAGGTTGTCTGAGTGACTTCACGGGTCGCCTTTGTGCAGTACTAGATA
TGCAGCAGACCTATGACATGTGGCTAAAGAAACACAATCCTGGGAAGCCT
GGAGAGGGAACACCACTCACTTCGCGAGAAGGGGAGAAACAGATCCAGAT
GCCCACTGACTATGCTGACATCATGATGGGCTACCACTGCTGGCTCTGCG
GGAAGAACAGCAACAGCAAGAAGCAATGGCAGCAGCACATCCAGTCAGAG
AAGCACAAGGAGAAGGTCTTCACCTCAGACAGTGACTCCAGCTGCTGGAG
CTATCGCTTCCCTATGGGCGAGTTCCAGCTCTGTGAAAGGTACCATGCAC
ATGGCTCTGTTTGATCCCAGAAGTGATGACTACTTAGTGGTAAAAACACA
TTTCCAGACACACAACTTCAGAAAATGAGTGCAAGCTTCAAGTCTGCCCT
TTGTAGCCATAATGTGCTCAGCTCTCGGTCTGCTGAACAGAGTCTACTTG
GCTCAATTCTTGGGGGAATCCCAGATGCTTTATTAGATTGTTTGAATGTC
TCACGCCCTCTGAATCAGTGCCTTGAGGTGCCTTCAGAAGGCTTGTGATG
GTTAGNNNTNGCATTTTGGTT
>gi|13675786|gb|BG625273.1|BG625273 pgn1c.pk002.d1 Normalized chicken lymphoid c
DNA library Gallus gallus cDNA clone pgn1c.pk002.d1 5' similar to gb|AAF05541.1|
AF188530_1 (AF188530) ubiquitous tetratricopeptide containing protein RoXaN [Hom
o sapiens]G, mRNA sequence
CAATTGATGACATCGAAACAGACTGCTCTATGGACCTGCAGTGCCTGCCA
GCTCCTGTGGCCACCTCCATCTCTGTGAGCGAGGGGCTGTCCCCTTTGCA
(...)




The source:

package org.lindenb.sandox;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import javax.script.Compilable;
import javax.script.CompiledScript;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineFactory;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
import javax.script.SimpleBindings;

/**
* my first test for java 1.6 mustang/ ScriptEngine
* @author Pierre Lindenbaum PhD
*
* example: cat *.fasta | java org.lindenb.sandox.FastaAWK "FastaAWK.gcPercent(sequence)<0.45 || (sequence.length>900) || (FastaAWK .reverseComplement(sequence).toUpperCase().indexOf('ATGCTTCTTG')!=-1 && name.indexOf('Xenopus')!=-1 );"
*
*/

public class FastaAWK {
/** header of fasta sequence */
private String name=null;
/** dna sequence */
private StringBuilder sequence= new StringBuilder();
/** compiled user script */
private CompiledScript compiledScript=null;


/** constructor
* initialize the statements
* @param statements
*/

public FastaAWK(String args[]) throws ScriptException
{
//copy statements to add the 'importClass'
String statements[]= new String[args.length+1];
//import this class to get a handle on gcPercent and reverseComplement
statements[0]="importClass(Packages."+this.getClass().getName()+");";
System.arraycopy(args, 0, statements, 1, args.length);

//get a javascript engine
ScriptEngineManager sem = new ScriptEngineManager();
ScriptEngine scriptEngine = sem.getEngineByName("js");
ScriptEngineFactory scriptEngineFactory= scriptEngine.getFactory();
String program = scriptEngineFactory.getProgram(statements);
//compile this program
this.compiledScript=((Compilable) scriptEngine).compile(program);
}

/**
*
* @param sequence the dna sequence
* @return the GC%
*/

public static double gcPercent(String sequence)
{
int n=0;
for(int i=0;i< sequence.length();++i)
{
char base= Character.toUpperCase(sequence.charAt(i));
n+=(base=='G' || base=='C'?1:0);
}
return (double)(n/(double)sequence.length());
}

/**
* return the reverse complement of a sequence
* @param sequence the dna sequence
* @return the reverse complement of a sequence
*/

public static String reverseComplement(String sequence)
{
StringBuilder b= new StringBuilder();
for(int i=sequence.length()-1;i>=0;--i)
{
switch(sequence.charAt(i))
{
case 'A': b.append('T');break;
case 'T': b.append('A');break;
case 'G': b.append('C');break;
case 'C': b.append('G');break;
case 'a': b.append('t');break;
case 't': b.append('a');break;
case 'g': b.append('c');break;
case 'c': b.append('g');break;
default: b.append('N');break;
}
}

return b.toString();
}

/**
* print the current fasta sequence if the compiled script return true
* @throws ScriptException
*/

public void eval() throws ScriptException
{
/* bind name and sequence to javascript variable 'name' and 'seq
uence'*/

SimpleBindings bindings= new SimpleBindings();
bindings.put("name", this.name);
bindings.put("sequence", this.sequence.toString());
//invoke the script with the current binding and get the result
Object o=this.compiledScript.eval(bindings);
if(o==null || !(o instanceof Boolean ) ) return;
//if the result is true: print the fasta sequence
Boolean b=(Boolean)o;
if(b.equals(Boolean.FALSE))
{
return;
}
System.out.print(name);
for(int i=0;i< sequence.length();++i)
{
if(i%50==0) System.out.println();
System.out.print(sequence.charAt(i));
}
System.out.println();

}

/**
* @param args
*/

public static void main(String[] args)
{
try {
FastaAWK awk= new FastaAWK(args);
//loop over fasta sequences
BufferedReader in= new BufferedReader(new InputStreamReader(System.in));
String line=null;
while((line=in.readLine())!=null)
{
if(line.startsWith(">"))
{
if(awk.sequence.length()>0)
{
awk.eval();
}
awk.sequence.setLength(0);
awk.name=line.trim();
}
else
{
awk.sequence.append(line.trim());
}
}
if(awk.sequence.length()>0)
{
awk.eval();
}

in.close();
}
catch (Throwable e)
{
e.printStackTrace();
}

}

}




Pierre Lindenbaum

PS: hey I put my latest presentation (in french) for the first time on slideshare.net !

No comments: