31 July 2009

A RESTful Web service storing DNA sequences: Jersey, my notebook


Jersey is the open source JAX-RS implementation for building RESTful Web services. JAX-RS uses java annotations to simplify the development and deployment of web service clients and endpoints. In this post I'll describe how I've implemented a naive RESTful web service for storing and querying a DNA database. This code was tested and deployed under netbeans 6.1.

The service is defined in a class named FastaRest. An annotation @Path gives the root of our services. The value of UriInfo is injected by @Context (this interface provides access to application and request information about the current URI). Our DNA database is just an associative array mapping each id (an integer) to its DNA sequence.

Building the Resource


/**
 * Resource class of the DNA service: every method it declares is published
 * below the "/fastadb/" path.
 */
@Path("/fastadb/") /** application path */
public class FastaRest {
/** injectable interface that provides access to application and request URI information */
@Context
private UriInfo context;

/** our DNA database: maps the id of a sequence to the DNA sequence; static, hence shared by all the instances of this class */
private static final Map<Integer,String> SEQUENCES= new HashMap<Integer, String>();
(...)
}

Adding a DNA sequence


A DNA sequence is inserted using the PUT method. The relative path must be /add and two parameters (@FormParam id, an integer, and @FormParam seq, the DNA sequence) must be provided. A JSON response is returned with a message telling whether the new sequence was added.
/**
 * Stores a new DNA sequence in the database.
 *
 * @param id  identifier of the new sequence (form parameter "id")
 * @param seq the DNA sequence (form parameter "seq"); only the letters
 *            a, t, g, c (upper or lower case) are accepted
 * @return a small JSON-like message: status 'ok' with the id when the sequence
 *         was stored, status 'error' with a message when the sequence is
 *         missing, is not a DNA sequence, or when the id is already used
 */
@PUT
@Path("/add")
@ConsumeMime("application/x-www-form-urlencoded")
@ProduceMime("application/json")
public String putSequence(
    @FormParam("id")int id,
    @FormParam("seq")String seq
    )
{
    // seq may be null when the form parameter is absent: answer with the
    // regular error message instead of failing with a NullPointerException
    if(seq==null || !seq.matches("[atgcATGC]+"))
    {
        return "{'status':'error','message':'illegal sequence'}";
    }

    /* the static map SEQUENCES is shared by every instance of the service:
     * the test and the insertion must happen under the same lock */
    synchronized(SEQUENCES)
    {
        if (SEQUENCES.containsKey(id))
        {
            // key and value are separated by ':' (the original used ',')
            return "{'status':'error','message':'already exists'}";
        }
        SEQUENCES.put(id, seq);
        return "{'status':'ok','id':"+id+"}";
    }
}

Retrieving a DNA sequence


The Fasta sequence of a given DNA is returned to the client using a GET method with the relative path /seq/{id-of-the-sequence}.
/**
 * Answers one sequence of the database in FASTA format.
 *
 * @param id identifier of the sequence, taken from the path "/seq/{id}"
 * @return a header line ">id" followed by the sequence, or an empty string
 *         when no sequence is stored under this id
 */
@GET
@Path("/seq/{id}")
@ProduceMime("text/plain")
public String getSequenceById(@PathParam("id") int id)
{
    final String dna;
    // read the shared map under its lock, build the answer outside of it
    synchronized(SEQUENCES)
    {
        dna=SEQUENCES.get(id);
    }
    if(dna==null)
    {
        return "";
    }
    return ">"+id+"\n"+dna+"\n";
}

Dump all the DNA sequences

The fasta sequences of all the DNAs are returned to the client using a GET method with the relative path /seqs. As this result may be huge, a StreamingOutput object is returned rather than a String.
/**
 * Dumps the whole database in FASTA format.
 *
 * @return an object writing, for each stored sequence, a header line ">id"
 *         followed by the sequence; the map is locked while it is written
 */
@GET
@Path("/seqs")
@ProduceMime("text/plain")
public StreamingOutput getSequences()
{
    return new StreamingOutput()
    {
        public void write(OutputStream out) throws IOException, WebApplicationException
        {
            PrintStream printer= new PrintStream(out);
            synchronized(SEQUENCES)
            {
                for(Map.Entry<Integer,String> record: SEQUENCES.entrySet())
                {
                    printer.print(">"+record.getKey()+"\n"+record.getValue()+"\n");
                }
                printer.flush();
            }
        }
    };
}

Test


Add a sequence
curl -X PUT -d 'id=1&seq=AAATAGCTAGTCGACGATCGTAG' "http://localhost:17370/REST01/resources/fastadb/add"
{'status':'ok','id',1}

Add a second sequence
curl -X PUT -d 'id=2&seq=AGCTAGAGCGGCTATATGC' "http://localhost:17370/REST01/resources/fastadb/add"
{'status':'ok','id',2}

Try to re-insert sequence id=2
curl -X PUT -d 'id=2&seq=AAAA' "http://localhost:17370/REST01/resources/fastadb/add"
{'status':'error','message','already exists'}

Try to insert something that is not a DNA sequence
curl -X PUT -d 'id=3&seq=NotADnaSequence' "http://localhost:17370/REST01/resources/fastadb/add"
{'status':'error','message':'illegal sequence'}

Retrieve sequence id=1
curl "http://localhost:17370/REST01/resources/fastadb/seq/1"
>1
AAATAGCTAGTCGACGATCGTAG

Fetch all
curl "http://localhost:17370/REST01/resources/fastadb/seqs"
>1
AAATAGCTAGTCGACGATCGTAG
>2
AGCTAGAGCGGCTATATGC



That's it !

Source code

import com.sun.jersey.api.representation.FormParam;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.util.HashMap;
import java.util.Map;
import javax.ws.rs.Path;
import javax.ws.rs.PathParam;
import javax.ws.rs.GET;
import javax.ws.rs.PUT;
import javax.ws.rs.ProduceMime;
import javax.ws.rs.ConsumeMime;
import javax.ws.rs.WebApplicationException;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.StreamingOutput;
import javax.ws.rs.core.UriInfo;

/**
 * Naive RESTful service storing DNA sequences in memory.
 * All the methods are published below the "/fastadb/" path.
 */
@Path("/fastadb/")
public class FastaRest {
    /** injected access to the application and request URI information */
    @Context
    private UriInfo context;

    /**
     * the DNA database: sequence id -> DNA sequence. The map is static, hence
     * shared by all the instances; every access is synchronized on the map.
     */
    private static final Map<Integer,String> SEQUENCES= new HashMap<Integer, String>();

    public FastaRest() {
    }

    /**
     * Answers one sequence of the database in FASTA format.
     *
     * @param id identifier of the sequence, taken from the path "/seq/{id}"
     * @return a header line ">id" followed by the sequence, or an empty string
     *         when no sequence is stored under this id
     */
    @GET
    @Path("/seq/{id}")
    @ProduceMime("text/plain")
    public String getSequenceById(@PathParam("id") int id)
    {
        String seq=null;
        synchronized(SEQUENCES)
        {
            seq=SEQUENCES.get(id);
        }
        if(seq!=null)
        {
            return ">"+id+"\n"+seq+"\n";
        }
        return "";
    }

    /**
     * Dumps the whole database in FASTA format.
     *
     * @return an object writing, for each stored sequence, a header line ">id"
     *         followed by the sequence; the map is locked while it is written
     */
    @GET
    @Path("/seqs")
    @ProduceMime("text/plain")
    public StreamingOutput getSequences()
    {
        return new StreamingOutput()
        {
            public void write(OutputStream out) throws IOException, WebApplicationException
            {
                synchronized(SEQUENCES)
                {
                    PrintStream w= new PrintStream(out);
                    // iterate over the entries: one pass, no second lookup per id
                    for(Map.Entry<Integer,String> entry: SEQUENCES.entrySet())
                    {
                        w.print(">"+entry.getKey()+"\n"+entry.getValue()+"\n");
                    }
                    w.flush();
                }
            }
        };
    }

    /**
     * Stores a new DNA sequence in the database.
     *
     * @param id  identifier of the new sequence (form parameter "id")
     * @param seq the DNA sequence (form parameter "seq"); only the letters
     *            a, t, g, c (upper or lower case) are accepted
     * @return a small JSON-like message: status 'ok' with the id when the
     *         sequence was stored, status 'error' with a message when the
     *         sequence is missing, is not a DNA sequence, or when the id is
     *         already used
     */
    @PUT
    @Path("/add")
    @ConsumeMime("application/x-www-form-urlencoded")
    @ProduceMime("application/json")
    public String putSequence(@FormParam("id")int id,@FormParam("seq")String seq)
    {
        // seq may be null when the form parameter is absent: answer with the
        // regular error message instead of failing with a NullPointerException
        if(seq==null || !seq.matches("[atgcATGC]+"))
        {
            return "{'status':'error','message':'illegal sequence'}";
        }

        // test and insertion must happen under the same lock
        synchronized(SEQUENCES)
        {
            if (SEQUENCES.containsKey(id))
            {
                // key and value are separated by ':' (the original used ',')
                return "{'status':'error','message':'already exists'}";
            }
            SEQUENCES.put(id, seq);
            return "{'status':'ok','id':"+id+"}";
        }
    }
}

web.xml :

<?xml version="1.0" encoding="UTF-8"?>
<web-app version="2.5" xmlns="http://java.sun.com/xml/ns/javaee" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://java.sun.com/xml/ns/javaee http://java.sun.com/xml/ns/javaee/web-app_2_5.xsd">
<!-- Deployment descriptor of the web application: declares the Jersey servlet and maps it to /resources/* -->
<!-- The Jersey servlet container, loaded when the application starts (load-on-startup) -->
<servlet>
<servlet-name>ServletAdaptor</servlet-name>
<servlet-class>com.sun.jersey.spi.container.servlet.ServletContainer</servlet-class>
<load-on-startup>1</load-on-startup>
</servlet>
<!-- Every URL below /resources/ is handled by the servlet declared above -->
<servlet-mapping>
<servlet-name>ServletAdaptor</servlet-name>
<url-pattern>/resources/*</url-pattern>
</servlet-mapping>
<!-- Session timeout; the Servlet specification expresses this value in minutes -->
<session-config>
<session-timeout>
30
</session-timeout>
</session-config>
<!-- Page served when the root of the application is requested -->
<welcome-file-list>
<welcome-file>index.jsp</welcome-file>
</welcome-file-list>
</web-app>

27 July 2009

Don't translate this ! Security with Spring, my notebook

In my previous post titled "SpringFramework/BeanFactory My notebook" I showed how to use the Spring Framework to manage a set of java beans with a xml file and a BeanFactory.

Now, let's add a drop of Security in our model.

Let's say you're working for a big paranoid pharma company where the user "lindenb" is not allowed to use the class spring.Translator with the standard genetic code. A naive approach is to extend spring.Translate:

package spring;

/**
 * Naive security layer: a Translate that refuses to translate with the
 * "Standard Code" when the program is run by the user "lindenb".
 */
public class TranslateSecurity extends Translate{
    /**
     * Checks the current user, then delegates to {@link Translate#translate}.
     *
     * @param sequence the DNA sequence to translate
     * @return the translation computed by the parent class
     * @throws SecurityException if the system property "user.name" is
     *         "lindenb" and the name of this genetic code is "Standard Code"
     */
    @Override
    public String translate(CharSequence sequence)
    {
        // this object is the genetic code being used: there is no join point
        // here, the name is read from the instance itself. The constant is on
        // the left side so that a genetic code without name does not fail.
        if("lindenb".equals(System.getProperty("user.name"))
            && "Standard Code".equals(getName()))
        {
            throw new SecurityException(
                "User lindenb is not allowed to use the genetic code \""+getName()+"\" !"
                );
        }
        return super.translate(sequence);
    }
}
But this strategy is not maintainable because you'll also have to copy this code in all the other classes implementing spring.Translate such as spring.NoProlineTranslate.

Here comes The Aspect Oriented Programming (AOP). You can think of AOP as a dynamic decorator design pattern. The decorator pattern allows additional behavior to be added to an existing class by wrapping the original class and duplicating its interface and then delegating to the original.

Let's create an Aspect:
package spring;

import org.aspectj.lang.JoinPoint;
import org.aspectj.lang.annotation.Aspect;
import org.aspectj.lang.annotation.Before;

/**
 * Aspect forbidding the user "lindenb" to use the genetic code named
 * "Standard Code".
 */
@Aspect
public class SecurityAspect
{
    /**
     * Advice run before the execution of any method of spring.Translator.
     *
     * @param jp the intercepted join point; its target is the translator
     *           being called
     * @throws SecurityException if the system property "user.name" is
     *         "lindenb" and the target is named "Standard Code"
     */
    @Before("execution(* spring.Translator.*(..))")
    public void checkSecurity(JoinPoint jp)
    {
        if("lindenb".equals(System.getProperty("user.name")))
        {
            // the pointcut matches every spring.Translator: cast to the
            // interface, a cast to the class Translate would fail for any
            // other implementation
            Translator geneticCode= Translator.class.cast(jp.getTarget());
            // constant on the left: a translator without name is allowed
            // instead of raising a NullPointerException
            if("Standard Code".equals(geneticCode.getName()))
            {
                throw new SecurityException(
                    "User lindenb is not allowed to use the genetic code \""+geneticCode.getName()+"\" !"
                    );
            }
        }
    }
}
This Aspect calls spring.SecurityAspect.checkSecurity() each time before calling any method of spring.Translator. This aspect is plugged into our original code just by adding a few nodes in the original beans.xml file.
<beans>
(...)
<!-- Enables the @AspectJ auto-proxying; proxy-target-class="true" asks for class-based proxies rather than interface-based ones -->
<aop:aspectj-autoproxy proxy-target-class="true"/>
<!-- The aspect itself is declared as a regular bean -->
<bean id="aspect01" class="spring.SecurityAspect"/>
(...)
</beans>


Compile & Execute as user "lindenb"

mkdir -p build
javac -d build -cp ${SPRING}/dist/spring.jar:${SPRING}/lib/aspectj/aspectjrt.jar -sourcepath src src/spring/*.java
java -cp build:${SPRING}/dist/spring.jar:${SPRING}/lib/jakarta-commons/commons-logging.jar:${SPRING}/lib/aspectj/aspectjrt.jar:${SPRING}/lib/aspectj/aspectjweaver.jar:${SPRING}/lib/cglib/cglib-nodep-2.1_3.jar spring.SprintTest01

Jul 27, 2009 3:55:38 PM org.springframework.context.support.AbstractApplicationContext prepareRefresh
INFO: Refreshing org.springframework.context.support.ClassPathXmlApplicationContext@5e3974: display name [org.springframework.context.support.ClassPathXmlApplicationContext@5e3974]; startup date [Mon Jul 27 15:55:38 CEST 2009]; root of context hierarchy
Jul 27, 2009 3:55:38 PM org.springframework.beans.factory.xml.XmlBeanDefinitionReader loadBeanDefinitions
INFO: Loading XML bean definitions from class path resource [beans.xml]
Jul 27, 2009 3:55:38 PM org.springframework.context.support.AbstractApplicationContext obtainFreshBeanFactory
INFO: Bean factory for application context [org.springframework.context.support.ClassPathXmlApplicationContext@5e3974]: org.springframework.beans.factory.support.DefaultListableBeanFactory@c44b88
Jul 27, 2009 3:55:38 PM org.springframework.beans.factory.support.DefaultListableBeanFactory preInstantiateSingletons
INFO: Pre-instantiating singletons in org.springframework.beans.factory.support.DefaultListableBeanFactory@c44b88: defining beans [gencode1,gencode2,gencode3,listOfGenCodes,org.springframework.aop.config.internalAutoProxyCreator,aspect01]; root of factory hierarchy
java.lang.SecurityException: User lindenb is not allowed to use the genetic code "Standard Code" !
at spring.SecurityAspect.checkSecurity(SecurityAspect.java:18)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
at java.lang.reflect.Method.invoke(Method.java:597)
at org.springframework.aop.aspectj.AbstractAspectJAdvice.invokeAdviceMethodWithGivenArgs(AbstractAspectJAdvice.java:627)
at org.springframework.aop.aspectj.AbstractAspectJAdvice.invokeAdviceMethod(AbstractAspectJAdvice.java:609)
at org.springframework.aop.aspectj.AspectJMethodBeforeAdvice.before(AspectJMethodBeforeAdvice.java:39)
at org.springframework.aop.framework.adapter.MethodBeforeAdviceInterceptor.invoke(MethodBeforeAdviceInterceptor.java:49)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:171)
at org.springframework.aop.interceptor.ExposeInvocationInterceptor.invoke(ExposeInvocationInterceptor.java:89)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:171)
at org.springframework.aop.framework.Cglib2AopProxy$DynamicAdvisedInterceptor.intercept(Cglib2AopProxy.java:635)
at spring.Translate$$EnhancerByCGLIB$$75c8a9e3.getName(<generated>)
at spring.SprintTest01.main(SprintTest01.java:20)


This security layer was added in our model without modifying the original classes.


That's it.
Pierre

Drawing a circular genome. Chapter 2: java swing

This post follows my previous post Ajax/PHP/Mysql/Canvas Drawing a circular genome, my notebook. The problem here, is drawing a circular genomic map that might contain a huge number of data and using an asynchronous method to fetch and display the data. Here, the server returning some JSON data is the same as in the last post but I now use a Java Swing client to fetch and display the data. Here again, the code is just a draft and I wouldn't write my final code like that.

The client is a javax.swing.JFrame. When the frame is opened, it opens a new Thread calling the server and fetching the JSON data (I previously described a JSON parser here). Once the data have been fetched, it can be only drawn in the Swing-Thread (all code that might affect or depend on the state of that component should be executed in this event-dispatching thread), that's why the drawing area is painted inside a SwingUtilities.invokeLater call.

import java.awt.Color;
import java.awt.Graphics;
import java.awt.Graphics2D;
import java.awt.LinearGradientPaint;
import java.awt.RenderingHints;
import java.awt.event.WindowAdapter;
import java.awt.event.WindowEvent;
import java.awt.geom.GeneralPath;
import java.awt.image.BufferedImage;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import javax.swing.JFrame;
import javax.swing.JPanel;
import javax.swing.SwingUtilities;


import org.lindenb.json.Parser;
import org.lindenb.swing.SwingUtils;

/**
 * Java Swing implementation of http://plindenbaum.blogspot.com/2009/07/ajaxphpmysqlcanvas-drawing-circular.html
 * This is just a proof of concept.
 *
 * The frame owns an offscreen image. A worker thread downloads the density of
 * a track from the server; the track is then drawn onto the offscreen image
 * in the event-dispatching thread and the drawing area is repainted.
 *
 * @author lindenb
 *
 */
public class CircularGenome extends JFrame
{
    /** length of the chromosome 1 */
    private static final double CHR1_LENGTH =248000000.0;
    private static final long serialVersionUID = 1L;

    /**
     * Drawing task: calls the JSON server, then draws the density of the
     * track in the event-dispatching thread.
     */
    private class ParseMapping
        implements Runnable
    {
        /** json url to be called */
        private final String url;
        /** UGLY: 0 when drawing the SNPs, 1 when drawing the genes */
        private final int step;
        /** number of items in each window, filled by run() */
        private List<Integer> counts=null;

        ParseMapping(int step,String url)
        {
            this.url=url;
            this.step=step;
        }

        @Override
        public void run()
        {
            try
            {
                Parser parser= new Parser();
                //call the PHP server and retrieve the density of objects for this track
                Object o;
                java.io.InputStream in= new URL(this.url).openStream();
                try
                {
                    o=parser.parse(in);
                }
                finally
                {
                    // always release the connection, even when the parsing fails
                    in.close();
                }
                Map<?,?> map=Map.class.cast(o);
                List<?> L=List.class.cast(map.get("counts"));
                this.counts= new ArrayList<Integer>(L.size());
                for(Object c:L)
                {
                    this.counts.add(Number.class.cast(Map.class.cast(c).get("count")).intValue());
                }
                /*
                 * SwingThread: Once a Swing component has been realized,
                 * all code that might affect or depend on the state of that component
                 * should be executed in the event-dispatching thread.
                 */
                SwingUtilities.invokeLater(new Runnable()
                {
                    @Override
                    public void run()
                    {
                        // go through getOffscreen(): the image is created lazily and
                        // the field may still be null if nothing was painted yet
                        Graphics2D g= getOffscreen().createGraphics();
                        g.setRenderingHint(RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON);

                        double radius= area.getWidth()/2.0;
                        // inner radius of the track; the second track is drawn outside the first one
                        double r1= radius/2.0;
                        if(step==1)
                        {
                            r1+= 2+radius/4.0;
                        }
                        //get max density
                        double max=0;
                        for(int i=0;i< counts.size();++i)
                        {
                            if(counts.get(i) > max) max= counts.get(i);
                        }

                        //loop over the items
                        for(int i=0;i< counts.size();++i)
                        {
                            double a1= Math.PI*2.0*i/counts.size();
                            double a2= Math.PI*2.0*(i+1)/counts.size();

                            // when every count is 0 the ratio would be 0/0 (NaN): draw a flat track instead
                            double ratio= (max>0 ? counts.get(i)/max : 0.0);
                            double r2= r1+ratio*(radius/4.0);
                            //draw the item
                            GeneralPath path= new GeneralPath();
                            path.moveTo( radius + Math.cos(a1)*r1, radius + Math.sin(a1)*r1);
                            path.lineTo( radius + Math.cos(a1)*r2, radius + Math.sin(a1)*r2);
                            path.lineTo( radius + Math.cos(a2)*r2, radius + Math.sin(a2)*r2);
                            path.lineTo( radius + Math.cos(a2)*r1, radius + Math.sin(a2)*r1);
                            path.closePath();
                            g.setColor(step==0?Color.RED:Color.YELLOW);
                            g.fill(path);
                            g.setColor(Color.BLACK);
                            g.draw(path);
                        }
                        g.dispose();
                        //repaint the drawing area
                        area.repaint();

                        if(step!=0) return;
                        //call a new Thread for the Gene
                        Thread t= new Thread(new ParseMapping(1,
                            "http://localhost/lindenb/ucsc.php?length="+windowLength+
                            "&database=knownGene")
                            );
                        t.start();
                    }
                });
            }
            catch (Exception e)
            {
                e.printStackTrace();
            }
        }
    }

    /** offscreen image where we paint the tracks */
    private BufferedImage offscreen=null;
    /** drawing area */
    private JPanel area=null;
    /** step size */
    private int windowLength;

    /**
     * Builds the frame and its drawing area; the first download is started
     * once the window has been opened.
     */
    public CircularGenome()
    {
        setDefaultCloseOperation(JFrame.DISPOSE_ON_CLOSE);
        setBounds(50, 50, 800, 800);
        setResizable(false);

        area=new JPanel(null)
        {
            private static final long serialVersionUID = 1L;
            //paint the drawing area in this panel
            @Override
            protected void paintComponent(Graphics g) {
                g.drawImage(getOffscreen(),0, 0,area);
            }
        };
        area.setOpaque(true);
        setContentPane(area);
        /* once the window is opened,
         * call the first thread
         */
        addWindowListener(new WindowAdapter()
        {
            @Override
            public void windowOpened(WindowEvent e)
            {
                double perimeter= 2*Math.PI*(area.getWidth()/4.0);
                windowLength = (int)Math.round(CHR1_LENGTH/perimeter)*4;
                Thread t= new Thread(new ParseMapping(0,
                    "http://localhost/lindenb/ucsc.php?length="+windowLength+"&database=snp129")
                    );
                t.start();
            }
        });
    }

    /** get the offscreen picture, create it, if it doesn't exist */
    private BufferedImage getOffscreen()
    {
        if(this.offscreen==null)
        {
            this.offscreen=new BufferedImage(
                area.getWidth(),
                area.getHeight(),
                BufferedImage.TYPE_INT_RGB
                );
            //prepare the picture, add a gradient for the background.
            //the gradient spans the image, which has the size of the drawing area
            LinearGradientPaint paint= new LinearGradientPaint(
                area.getWidth()/2f,0,area.getWidth()/2f,area.getHeight(),
                new float[]{0f,1f},
                new Color[]{Color.WHITE,Color.BLACK}
                );
            Graphics2D g= this.offscreen.createGraphics();
            g.setPaint(paint);
            g.fillRect(0, 0, area.getWidth(), area.getHeight());
            g.dispose();
        }
        return this.offscreen;
    }

    public static void main(String[] args) {
        try {
            CircularGenome g= new CircularGenome();
            SwingUtils.show(g);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}



The result looks the same as in the previous javascript client.

Now I wonder if it would be worth trying to implement this using Java-FX.

That's it.
Pierre

24 July 2009

Ajax/PHP/Mysql/Canvas Drawing a circular genome, my notebook.

I've been asked to draw a circular map of the genome. Some tools already exist, for example circos, a Perl program.



Jan Aerts is also writing pARP, a circular genome browser using Ruby and ruby-processing:


My data are stored in a big database and it might take some time before all the data are processed and displayed. So my idea was to call the server with some asynchronous ajax queries, retrieve the chunks of data and display each chunk as soon as it is returned by the server.

The code below is a proof of concept. This code is ugly, I wouldn't code things like this for a real piece of software. As a source of data I've used the snp129 and the knownGene tables of the UCSC stored in a mysql database. The server was implemented using PHP.

Client Side

When the document is loaded, the <canvas> element is resized. A first AJAX query is sent to retrieve an array of density of the SNPs on the human chromosome 1. The JSON response is processed, the maximum number of SNPs is found and each item of this array is displayed on the canvas. After that, a second AJAX query is sent to retrieve the density of the genes.
<html xmlns="http://www.w3.org/1999/xhtml"><head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"/>
<script><![CDATA[
/** the canvas element */
var canvas = null;
/** radius of the canvas */
var radius=500;
/** AJAX request */
var httpRequest=null;
/** Graphics context */
var g=null;
/** length of chrom1 */
var CHR1_LENGTH =248000000.0;
/** window length (pb) */
var windowLength=0;
/** first track is snp129 */
var database="snp129";

/**
 * AJAX callback: once the response is complete, draws the density of the
 * current track on the canvas. After the "snp129" track, it switches the
 * colors and schedules the request for the "knownGene" track.
 */
function paintSnps()
{
    // readyState 4: the whole response has been received
    if (httpRequest.readyState != 4) return;
    // anything but 200: nothing to draw
    if (httpRequest.status != 200) return;

    // NOTE(review): eval() runs whatever the server sends; switch to JSON.parse
    // once the server answers with strict JSON (quoted keys, double quotes)
    var jsondata=eval("("+httpRequest.responseText+")");
    var counts=jsondata.counts;
    // an error message of the server has no 'counts' array
    if(!counts) return;

    //get the maximum of item
    var max=0;
    for(var i=0;i< counts.length;++i)
    {
        if(counts[i].count > max) max= counts[i].count*1.0;
    }
    // inner radius of the track; the genes are drawn outside the SNPs
    var r1= radius/2.0;
    if(database=="knownGene")
    {
        r1+= 2+radius/4.0;
    }
    //loop over the items
    for(var i=0;i< counts.length;++i)
    {
        var a1= Math.PI*2.0*i/(1.0*counts.length);
        var a2= Math.PI*2.0*(i+1)/(1.0*counts.length);

        // when every count is 0 the ratio would be 0/0 (NaN): draw a flat track instead
        var ratio= (max>0 ? counts[i].count/max : 0);
        var r2= r1+ratio*(radius/4.0);
        //draw the item
        g.beginPath();
        g.moveTo( radius + Math.cos(a1)*r1, radius + Math.sin(a1)*r1);
        g.lineTo( radius + Math.cos(a1)*r2, radius + Math.sin(a1)*r2);
        g.lineTo( radius + Math.cos(a2)*r2, radius + Math.sin(a2)*r2);
        g.lineTo( radius + Math.cos(a2)*r1, radius + Math.sin(a2)*r1);
        // close the shape, otherwise its fourth side is not stroked
        g.closePath();
        g.stroke();
        g.fill();
    }
    //if it was snp, then look for knownGene, change the colors
    if(database=="snp129")
    {
        database="knownGene";
        g.fillStyle = "yellow";
        g.strokeStyle = "blue";
        setTimeout("fetchDB()",100);
    }
}

/**
 * Sends the asynchronous request for the current track (global 'database')
 * with the current window length (global 'windowLength'); the response is
 * handled by paintSnps.
 */
function fetchDB()
{
    httpRequest= new XMLHttpRequest();
    httpRequest.onreadystatechange = paintSnps;
    // a GET request has no body: the parameters go in the query string,
    // separated by '&'
    var query= "length="+encodeURIComponent(windowLength)+
        "&database="+encodeURIComponent(database);
    httpRequest.open('GET', 'ucsc.php?'+query, true);
    httpRequest.send(null);
}

/**
 * Initializes the document: resizes the canvas, paints the background,
 * computes the window length and launches the first AJAX request.
 */
function init()
{
    canvas=document.getElementById("genome");
    // no canvas element or no canvas support: nothing can be drawn
    if (!canvas || !canvas.getContext) return;
    //resize canvas
    canvas.setAttribute("width",2*radius);
    canvas.setAttribute("height",2*radius);
    g = canvas.getContext('2d');
    //paint background
    var lineargradient = g.createLinearGradient(radius,0,radius,2*radius);
    lineargradient.addColorStop(0,'white');
    lineargradient.addColorStop(1,'black');
    g.fillStyle = lineargradient;
    g.fillRect(0,0,2*radius,2*radius);
    g.strokeStyle = "black";
    g.strokeRect(0,0,2*radius,2*radius);
    // colors of the first track
    g.fillStyle = "red";
    g.strokeStyle = "green";

    // perimeter of the inner circle of the first track (radius/2)
    var perimeter= 2*Math.PI*(radius/2.0);
    windowLength = Math.round(CHR1_LENGTH/perimeter);

    //launch the first ajax request; a function reference avoids the
    //implicit eval of a string argument
    setTimeout(fetchDB,100);
}


]]></script>
</head><body onload="init();">
<canvas id="genome" />
</body></html>

The server

The (ugly) PHP page is a simple script returning the density of the objects mapped on the chromosome 1 for a given table.
<?php
$con=NULL;

/**
 * Closes the MySQL connection if it was opened, flushes the output
 * and ends the script.
 */
function cleanup()
{
    // $con is defined at the script level: it must be declared global,
    // otherwise the function only sees an undefined local variable
    global $con;
    if($con!=NULL) mysql_close($con);
    // flush is a function: it must be called
    flush();
    exit;
}

// Main script: prints, for one table, the number of objects mapped on the
// chromosome 1 in each window of $length bases.
header('Cache-Control: no-cache, must-revalidate');
header('Content-type: application/json');
header("Content-Disposition: attachment; filename=\"result.json\"");
// NOTE(review): this second Content-type replaces the application/json sent above - confirm which one is intended
header('Content-type: text/plain');

$con = mysql_connect('localhost', 'anonymous', '');
if (!$con) {
    // escape the quotes of the message so that the answer remains parsable
    echo "{status:'Error',message:'". addslashes(mysql_error())."'}";
    cleanup();
}
if(!mysql_select_db('hg18', $con))
{
    echo "{status:'Error',message:'cannot select db'}";
    cleanup();
}

// the name of the table comes from the request and is concatenated into the
// SQL query: only the known tables are accepted
$allowedTables=array("snp129","knownGene");
$database="snp129";
if(isset($_GET["database"]))
{
    $database=$_GET["database"];
}
if(!in_array($database,$allowedTables,TRUE))
{
    echo "{status:'Error',message:'unknown database'}";
    cleanup();
}

// size of the window; the cast to int makes it safe to use in the query
$length=1E6;
if(isset($_GET["length"]))
{
    $length= (int)$_GET["length"];
}
if($length<=0) $length=1E6;

// name of the column holding the start position in the selected table
$nameStart="chromStart";
if($database=="knownGene")
{
    $nameStart="txStart";
}

$sql="SELECT CAST(ROUND(".$nameStart."/".$length.") AS SIGNED INTEGER )*".$length.",count(*) from ".$database." where ".
    " chrom=\"chr1\" ".
    " group by CAST(ROUND(".$nameStart."/".$length.") AS SIGNED INTEGER )*".$length.
    " order by 1"
    ;

$result = mysql_query($sql ,$con );

if(!$result)
{
    echo "{status:'Error',message:'".addslashes(mysql_error($con)) ."'}";
    cleanup();
}

// TRUE once a first item was printed: a separator is then needed
$found=FALSE;

echo "{status:'OK',";
echo "length:".$length.",";
echo "counts:[";

while ($row = mysql_fetch_array($result))
{
    if($found) echo ",\n";
    $found=TRUE;
    echo "{chromStart:".$row[0].",count:".$row[1]."}";
}

echo "]}";

cleanup();

?>
And here is the kind of JSON document returned by the server:
{status:'OK',
length:1000000,
counts:[
{chromStart:0,count:6191},
{chromStart:1000000,count:8897},
{chromStart:2000000,count:5559},
{chromStart:3000000,count:6671},
{chromStart:4000000,count:6398},
{chromStart:5000000,count:5462},
{chromStart:6000000,count:5678},
{chromStart:7000000,count:4737},
{chromStart:8000000,count:5313},
{chromStart:9000000,count:5148},
{chromStart:10000000,count:4055},
{chromStart:11000000,count:5012},
{chromStart:12000000,count:5363},
{chromStart:13000000,count:10165},

(...)

{chromStart:239000000,count:5502},
{chromStart:240000000,count:6173},
{chromStart:241000000,count:7928},
{chromStart:242000000,count:3800},
{chromStart:243000000,count:5503},
{chromStart:244000000,count:7120},
{chromStart:245000000,count:6148},
{chromStart:246000000,count:6015},
{chromStart:247000000,count:5337}
]
}

Result




That's it

PS: Hum, yes I know , it's not as fast/beautiful as GenoDive that was introduced at Biohackathon.



Pierre

23 July 2009

SpringFramework/BeanFactory My notebook.

This post is my notebook for programming with the Spring Framework.

(wikipedia)The Spring Framework is an open source application framework for the Java platform. Central to the Spring Framework is its Inversion of Control container, which provides a consistent means of configuring and managing Java objects using callbacks. The container is responsible for managing object lifecycles: creating objects, calling initialization methods, and configuring objects by wiring them together. Objects created by the container are also called Managed Objects or Beans. Typically, the container is configured by loading XML files containing Bean definitions which provide the information required to create the beans.

OK, say you have been asked to write a Java program translating a DNA sequence with a default genetic code. This program will also list all the available genetic codes. As, in this software, the translated sequences should not contain any Proline, we also want to test this fact in our development version. First we need an Interface named spring.Translator :

package spring;
public interface Translator {
/** answers the name of this translator */
public String getName();
/** answers the translation of this DNA sequence */
public String translate(CharSequence sequence);
}
A basic implementation of this interface uses an ordered String of amino acids to translate the DNA.
package spring;

/**
 * Translator driven by a genetic code given as a String of amino acids:
 * the amino acid of a codon is found at the index
 * first*16 + second*4 + third, each base being numbered t=0, c=1, a=2, g=3.
 */
public class Translate
    implements Translator
{
    /** bases in the order of their index in the genetic code */
    private static final String BASES = "tcag";

    private String geneticCode;
    private String name;

    /** sets the String of amino acids used for the translation */
    public void setCode(String geneticCode) {
        this.geneticCode = geneticCode;
    }

    /** sets the name of this genetic code */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the name of this genetic code */
    public String getName()
    {
        return this.name;
    }

    /** @return the index of a base (t=0, c=1, a=2, g=3, any case), or -1 for any other character */
    private int base2index(char c)
    {
        return BASES.indexOf(Character.toLowerCase(c));
    }

    /**
     * Translates the sequence codon by codon; trailing bases that do not
     * make a whole codon are ignored.
     *
     * @param sequence the DNA sequence
     * @return one character per codon: the amino acid of the genetic code,
     *         or '?' when the codon contains a character that is not a base
     */
    @Override
    public String translate(CharSequence sequence) {
        final int length = sequence.length();
        final StringBuilder protein = new StringBuilder(1 + length / 3);
        for (int pos = 0; pos + 2 < length; pos += 3)
        {
            final int first = base2index(sequence.charAt(pos));
            final int second = base2index(sequence.charAt(pos + 1));
            final int third = base2index(sequence.charAt(pos + 2));
            if (first < 0 || second < 0 || third < 0)
            {
                protein.append('?');
                continue;
            }
            protein.append(this.geneticCode.charAt(first * 16 + second * 4 + third));
        }
        return protein.toString();
    }
}
We also create a class called spring.NoProlineTranslate for our test:
package spring;

/**
 * Translate used for testing: the translation is rejected when it
 * contains a Proline ('P').
 */
public class NoProlineTranslate extends Translate{
    /**
     * @param sequence the DNA sequence
     * @return the translation computed by the parent class
     * @throws IllegalArgumentException if the translation contains a 'P'
     */
    @Override
    public String translate(CharSequence sequence)
    {
        final String protein = super.translate(sequence);
        if (protein.indexOf('P') != -1)
        {
            throw new IllegalArgumentException("Sequence should not contain any Proline");
        }
        return protein;
    }
}
And in a file called beans.xml we define our java beans. Here we define four beans:
  • A Translate object for the standard genetic code
  • A Translate object for the Mitochondrial code
  • An instance of spring.NoProlineTranslate
  • And a java.util.List of two Genetic Codes
  • We also define an alias "defaultcode" for the default genetic code
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:util="http://www.springframework.org/schema/util"
xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-2.5.xsd
http://www.springframework.org/schema/util http://www.springframework.org/schema/util/spring-util-2.5.xsd
http://www.springframework.org/schema/tx http://www.springframework.org/schema/tx/spring-tx-2.5.xsd
http://www.springframework.org/schema/aop http://www.springframework.org/schema/aop/spring-aop-2.5.xsd" >


<!-- The property "code" holds 64 amino acids, one per codon; spring.Translate reads it at the index first*16+second*4+third with the bases numbered t=0, c=1, a=2, g=3 -->
<!-- Standard genetic code -->
<bean id="gencode1" class="spring.Translate">
<property name="name" value="Standard Code"/>
<property name="code" value="FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"/>
</bean>
<!-- Vertebrate mitochondrial genetic code -->
<bean id="gencode2" class="spring.Translate">
<property name="name" value="Vertebrate Mitochondrial Code"/>
<property name="code" value="FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG"/>
</bean>
<!-- Test translator rejecting the translations that contain a Proline -->
<!-- NOTE(review): this bean is named "Vertebrate Mitochondrial Code" but its code is the same String as the one of gencode1 - possibly a copy/paste, to be confirmed -->
<bean id="gencode3" class="spring.NoProlineTranslate">
<property name="name" value="Vertebrate Mitochondrial Code"/>
<property name="code" value="FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"/>
</bean>

<!-- "defaultcode" is another name for the bean gencode1 -->
<alias name="gencode1" alias="defaultcode"/>

<!-- A java.util.ArrayList containing the beans gencode1 and gencode2 -->
<util:list id="listOfGenCodes" list-class="java.util.ArrayList">
<ref bean="gencode1"/>
<ref bean="gencode2"/>
</util:list>
</beans>
And here is the program creating each instance
package spring;
import java.util.List;

import org.springframework.beans.factory.BeanFactory;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;

/**
 * Loads the beans defined in beans.xml, lists the genetic codes of the bean
 * "listOfGenCodes" and translates a DNA sequence with the bean "defaultcode".
 */
public class SprintTest01
{
    public static void main(String[] args)
    {
        try
        {
            final ApplicationContext appContext =
                new ClassPathXmlApplicationContext("beans.xml");
            // an ApplicationContext is a BeanFactory
            final BeanFactory beans = appContext;

            /* list the genetic codes in the list "listOfGenCodes" */
            final List<Translate> available = (List<Translate>) beans.getBean("listOfGenCodes");
            for (Translate code : available)
            {
                System.out.println("Genetic code available: "+code.getName());
            }

            /* translate with the default genetic code defined in beans.xml */
            final String dna =
                "ATGGAGAGGCAGAAACGGAAGGCGGACATCGAGAAG"+
                "GGGCTGCAGTTCATTCAGTCGACACTACCCCTAAAGCAAGAAGAGTATGAGGCCTTTCTGCTCAAGCTGG"+
                "TGCAGAATCTGTTTGCTGAGGGCAATGATCTGTTCCGGGAGAAGGACTATAAGCAGGCTCTGGTGCAGTA"+
                "CATGGAAGGGCTGAACGTGGCCGACTACGCTGCCTCTGACCAGGTGGCCCTGCCCCGGGAGCTGCTGTGC";
            final Translate defaultCode = (Translate) beans.getBean("defaultcode");
            System.out.println("Default code is \""+ defaultCode.getName()+"\" : "+defaultCode.getClass());
            System.out.println(defaultCode.translate(dna));
        }
        catch(Throwable err)
        {
            err.printStackTrace();
        }
    }
}

Compiling

mkdir -p build
javac -d build -cp ${SPRINGDIR}/dist/spring.jar -sourcepath src src/spring/*.java
java -cp build:${SPRINGDIR}/dist/spring.jar:${SPRINGDIR}/lib/jakarta-commons/commons-logging.jar spring.SprintTest01

Executing

java -cp build:${SPRINGDIR}/dist/spring.jar:${SPRINGDIR}/lib/jakarta-commons/commons-logging.jar spring.SprintTest01
Jul 23, 2009 3:58:13 PM org.springframework.context.support.AbstractApplicationContext prepareRefresh
INFO: Refreshing org.springframework.context.support.ClassPathXmlApplicationContext@1dd46f7: display name [org.springframework.conte
xt.support.ClassPathXmlApplicationContext@1dd46f7]; startup date [Thu Jul 23 15:58:13 CEST 2009]; root of context hierarchy
Jul 23, 2009 3:58:14 PM org.springframework.beans.factory.xml.XmlBeanDefinitionReader loadBeanDefinitions
INFO: Loading XML bean definitions from class path resource [beans.xml]
Jul 23, 2009 3:58:14 PM org.springframework.context.support.AbstractApplicationContext obtainFreshBeanFactory
INFO: Bean factory for application context [org.springframework.context.support.ClassPathXmlApplicationContext@1dd46f7]: org.springf
ramework.beans.factory.support.DefaultListableBeanFactory@1d2fc36
Jul 23, 2009 3:58:14 PM org.springframework.beans.factory.support.DefaultListableBeanFactory preInstantiateSingletons
INFO: Pre-instantiating singletons in org.springframework.beans.factory.support.DefaultListableBeanFactory@1d2fc36: defining beans [
gencode1,gencode2,gencode3,listOfGenCodes]; root of factory hierarchy

Genetic code available: Standard Code
Genetic code available: Vertebrate Mitochondrial Code
Default code is "Standard Code" : class spring.Translate
MERQKRKADIEKGLQFIQSTLPLKQEEYEAFLLKLVQNLFAEGNDLFREKDYKQALVQYMEGLNVADYAASDQVALPRELLC


Ok, and now the test can be run without modifying the code by just changing <alias name="gencode1" alias="defaultcode"/> to <alias name="gencode3" alias="defaultcode"/> in "beans.xml".
java -cp build:${SPRINGDIR}/dist/spring.jar:${SPRINGDIR}/lib/jakarta-commons/commons-logging.jar spring.SprintTest01
Jul 23, 2009 3:58:47 PM org.springframework.context.support.AbstractApplicationContext prepareRefresh
INFO: Refreshing org.springframework.context.support.ClassPathXmlApplicationContext@1dd46f7: display name [org.springframework.conte
xt.support.ClassPathXmlApplicationContext@1dd46f7]; startup date [Thu Jul 23 15:58:47 CEST 2009]; root of context hierarchy
Jul 23, 2009 3:58:47 PM org.springframework.beans.factory.xml.XmlBeanDefinitionReader loadBeanDefinitions
INFO: Loading XML bean definitions from class path resource [beans.xml]
Jul 23, 2009 3:58:47 PM org.springframework.context.support.AbstractApplicationContext obtainFreshBeanFactory
INFO: Bean factory for application context [org.springframework.context.support.ClassPathXmlApplicationContext@1dd46f7]: org.springf
ramework.beans.factory.support.DefaultListableBeanFactory@1d2fc36
Jul 23, 2009 3:58:47 PM org.springframework.beans.factory.support.DefaultListableBeanFactory preInstantiateSingletons
INFO: Pre-instantiating singletons in org.springframework.beans.factory.support.DefaultListableBeanFactory@1d2fc36: defining beans [
gencode1,gencode2,gencode3,listOfGenCodes]; root of factory hierarchy

Genetic code available: Standard Code
Genetic code available: Vertebrate Mitochondrial Code
Default code is "Vertebrate Mitochondrial Code" : class spring.NoProlineTranslate

java.lang.IllegalArgumentException: Sequence should not contains any Proline
at spring.NoProlineTranslate.translate(NoProlineTranslate.java:8)
at spring.SprintTest01.main(SprintTest01.java:25)


That's it.
Pierre

21 July 2009

A simple java-based SVG renderer.

A short post. I've just implemented a simple and small SVG renderer. It works fine with simple Documents.



// Configure a namespace-aware, non-validating DOM parser that merges CDATA into text
// nodes, expands entity references and drops comments.
DocumentBuilderFactory domFactory= DocumentBuilderFactory.newInstance();
domFactory.setCoalescing(true);
domFactory.setExpandEntityReferences(true);
domFactory.setIgnoringComments(true);
domFactory.setNamespaceAware(true); // was set twice in the original snippet; once is enough
domFactory.setValidating(false);
DocumentBuilder domBuilder= domFactory.newDocumentBuilder();
// parse(String) takes a URI, so the SVG document is fetched over HTTP here
Document dom=domBuilder.parse("http://upload.wikimedia.org/wikipedia/commons/f/fd/Ghostscript_Tiger.svg");
// SVGIcon is the renderer presented in this post; 256,256 is presumably the icon width and height
SVGIcon icon= new SVGIcon(dom,256,256);
// display the rendered icon in a plain modal dialog
JOptionPane.showMessageDialog(null, new JLabel(icon),"SVG Icon",JOptionPane.PLAIN_MESSAGE);




That's it.
Pierre

19 July 2009

3D histograms using CSS -moz-transform

Firefox 3.5 includes a new CSS property called -moz-transform. The -moz-transform CSS property lets you modify the coordinate space of the CSS visual formatting model. Using it, elements can be translated, rotated, scaled, and skewed, as this text is.

I've used this new property to draw a 3D histogram:

[0,0]
124
99%
[0,1]
95
77%
[0,2]
87
68%
[0,3]
72
54%
[0,4]
60
[0,5]
50
43%
[1,0]
139
78%
[1,1]
137
64%
[1,2]
108
63%
[1,3]
81
[1,4]
67
40%
[1,5]
57
38%
[2,0]
177
59%
[2,1]
137
55%
[2,2]
129
[2,3]
102
40%
[2,4]
80
38%
[2,5]
61
35%
[3,0]
181
58%
[3,1]
167
45%
[3,2]
149
42%
[3,3]
123
33%
[3,4]
117
26%
[3,5]
108
20%
[4,0]
237
45%
[4,1]
194
45%
[4,2]
191
35%
[4,3]
152
26%
[4,4]
123
22%
[4,5]
118
18%
[5,0]
300
40%
[5,1]
235
31%
[5,2]
208
28%
[5,3]
165
23%
[5,4]
149
19%
[5,5]
131
16%


Here is the code for a simple cube:

<!-- left pane -->
<div style="position:absolute; -moz-transform-origin: 0px 0px; -moz-transform: translate(300px,200px) rotate(90deg) skew(-45deg); background:gray; font-size:36px; color:white; width:300px; height:40px; border:1px solid black;text-align:right;overflow:hidden;">[0,0]</div>

<!-- right pane -->
<div style="position:absolute; -moz-transform-origin: 0px 0px; -moz-transform: translate(340px,160px) rotate(90deg) skew(45deg); background:lightGrey ; font-size:36px; color:white; width:300px; height:40px;border:1px solid black;text-align:right;overflow:hidden;">124</div>

<!-- top pane -->
<div style="position:absolute; -moz-transform-origin:0 0; -moz-transform: translate(300px,120px) skew(-45deg, 45deg); background:dimgray ; font-size:18px; color:white; width:40px; height:40px;border:1px solid black;text-align:center;overflow:hidden;" title="300">99%</div>


If you don't have Firefox 3.5, here is a screenshot showing how my browser displays this page (at the top, the same page viewed in Konqueror).


That's it
Pierre

17 July 2009

Indexing and Searching NCBI Genes with Apache Lucene



In this post I'll show how Apache Lucene can be used programmatically to index the content of a set of NCBI Gene entries and how to query and retrieve those data.

(via wikipedia:) Apache Lucene is a free/open source information retrieval Java library. It is supported by the Apache Software Foundation. While suitable for any application which requires full text indexing and searching capability, Lucene has been widely recognized for its utility in the implementation of Internet search engines and local, single-site searching. At the core of Lucene's logical architecture is the idea of a document containing fields of text. This flexibility allows Lucene's API to be independent of the file format. Text from PDFs, HTML, Microsoft Word, and OpenDocument documents, as well as many others can all be indexed so long as their textual information can be extracted.

Here my source of data is a set of XML EntrezGene entries related to the initiation of translation and downloaded from the NCBI.

<Entrezgene-Set>

<Entrezgene>
<Entrezgene_track-info>
<Gene-track>
<Gene-track_geneid>1981</Gene-track_geneid>
<Gene-track_status value="live">0</Gene-track_status>
<Gene-track_create-date>
<Date>
(...)
</Date>
</Gene-track_create-date>
</Gene-track>
</Entrezgene_track-info>
<Entrezgene_type value="protein-coding">6</Entrezgene_type>
<Entrezgene_source>
<BioSource>
<BioSource_genome value="genomic">1</BioSource_genome>
<BioSource_origin value="natural">1</BioSource_origin>
<BioSource_org>
<Org-ref>
<Org-ref_taxname>Homo sapiens</Org-ref_taxname>
<Org-ref_common>human</Org-ref_common>
<Org-ref_syn>
<Org-ref_syn_E>man</Org-ref_syn_E>
</Org-ref_syn>
<Org-ref_orgname>
<OrgName>
<OrgName_name>
<OrgName_name_binomial>
<BinomialOrgName>
<BinomialOrgName_genus>Homo</BinomialOrgName_genus>
<BinomialOrgName_species>sapiens</BinomialOrgName_species>
</BinomialOrgName>
</OrgName_name_binomial>
</OrgName_name>
<OrgName_lineage>Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; Homo</OrgName_lineage>
<OrgName_gcode>1</OrgName_gcode>
<OrgName_mgcode>2</OrgName_mgcode>
<OrgName_div>PRI</OrgName_div>
</OrgName>
</Org-ref_orgname>
</Org-ref>
</BioSource_org>
<BioSource_subtype>
<SubSource>
<SubSource_subtype value="chromosome">1</SubSource_subtype>
<SubSource_name>3</SubSource_name>
</SubSource>
</BioSource_subtype>
</BioSource>
</Entrezgene_source>
<Entrezgene_gene>
<Gene-ref>
<Gene-ref_locus>EIF4G1</Gene-ref_locus>
<Gene-ref_desc>eukaryotic translation initiation factor 4 gamma, 1</Gene-ref_desc>
<Gene-ref_maploc>3q27-qter</Gene-ref_maploc>
<Gene-ref_db>
<Dbtag>
(...)
</Dbtag>
</Gene-ref_db>
<Gene-ref_syn>
<Gene-ref_syn_E>p220</Gene-ref_syn_E>
<Gene-ref_syn_E>EIF4F</Gene-ref_syn_E>
<Gene-ref_syn_E>EIF4G</Gene-ref_syn_E>
<Gene-ref_syn_E>DKFZp686A1451</Gene-ref_syn_E>
</Gene-ref_syn>
</Gene-ref>
</Entrezgene_gene>
<Entrezgene_prot>
<Prot-ref>
<Prot-ref_name>
<Prot-ref_name_E>eukaryotic translation initiation factor 4 gamma, 1</Prot-ref_name_E>
<Prot-ref_name_E>EIF4-gamma</Prot-ref_name_E>
</Prot-ref_name>
<Prot-ref_desc>eukaryotic translation initiation factor 4 gamma, 1</Prot-ref_desc>
</Prot-ref>
</Entrezgene_prot>
<Entrezgene_summary>The protein encoded by this gene is a component of the protein complex EIF4F, which is involved in the recognition of the mRNA cap, ATP-dependent unwinding of 5'-terminal secondary structure, and recruitment of mRNA to the ribosome. Alternative splicing results in five transcript variants encoding four distinct isoforms. [provided by RefSeq]</Entrezgene_summary>
<Entrezgene_location>
<Maps>
<Maps_display-str>3q27-qter</Maps_display-str>
<Maps_method>
<Maps_method_map-type value="cyto"/>
</Maps_method>
</Maps>
</Entrezgene_location>
<Entrezgene_gene-source>
<Gene-source>
<Gene-source_src>LocusLink</Gene-source_src>
<Gene-source_src-int>1981</Gene-source_src-int>
<Gene-source_src-str2>1981</Gene-source_src-str2>
</Gene-source>
</Entrezgene_gene-source>
<Entrezgene_xtra-index-terms>
<Entrezgene_xtra-index-terms_E>LOC1981</Entrezgene_xtra-index-terms_E>
</Entrezgene_xtra-index-terms>
</Entrezgene>
</Entrezgene-Set>



Indexing the XML


We create a new standard Analyser breaking the sentences using English stop words.
Analyzer analyzer=new StandardAnalyzer();

An IndexWriter uses this Analyser and creates and maintains the index.

IndexWriter indexWriter=new IndexWriter(
this.luceneDir,//working directory
analyzer,
true,//create
IndexWriter.MaxFieldLength.UNLIMITED //no limit
);

The Entrezgene XML entries will be analyzed by a SAX Handler. Each time a textual field is found, its value is appended to a buffer that will be broken and digested by the Analyser. We also store the plain values of the ID and a title for each <Entrezgene> entry.
if(name.equals("Gene-track_geneid"))
{
this.id= this.content.toString();
}
else if(this.title==null && StringUtils.isIn(name,"Gene-ref_desc","Prot-ref_desc"))
{
this.title= this.content.toString();
}
this.text.append(this.content.toString()).append(" ");


Each time a <Entrezgene> tag is closed, a Document is created.
The values of the id and the title are saved to this document and the textual content is analysed.

Document document=new Document();
document.add(
new Field(
"id",
this.id,
Field.Store.YES,//Store the original field value in the index.
Field.Index.NOT_ANALYZED //Index the field's value without using an Analyzer, so it can be searched.
)
);
document.add(
new Field(
"title",
(this.title==null?this.id:this.title),
Field.Store.YES,//Store the original field value in the index.
Field.Index.NOT_ANALYZED //Index the field's value without using an Analyzer, so it can be searched.
)
);
document.add(
new Field(
"content",
this.text.toString(),
Field.Store.YES,//Store the original field value in the index.
Field.Index.ANALYZED//Index the tokens produced by running the field's value through an Analyzer.
)
);

A specific 'weight' (boost) can be assigned to some documents (default is 1.0). For example here, a weight of 100.0 is set for each document containing the word 'Rotavirus'.
if(this.text.toString().toLowerCase().contains("rotavirus"))
{
document.setBoost(100f);
}

... and the document is saved by the indexer:
this.indexWriter.addDocument(document);

at the end, the indexer is closed.

/* multiple files for each segment are merged into a single file when a new segment is flushed. */
indexWriter.setUseCompoundFile(true);
/* Requests an "optimize" operation on an index, priming the index for the fastest available search. */
indexWriter.optimize();
indexWriter.close();




Output


java -cp lucene-core-2.4.1.jar:build org.lindenb.tinytools.Lucene4Genes -p index gene_result.txt.gz
INFO: indexing genes in /tmp/lucene4genes
Jul 17, 2009 4:14:21 PM org.lindenb.tinytools.Lucene4Genes$GeneHandler endElement
INFO: adding document "eukaryotic translation initiation factor 6"
Jul 17, 2009 4:14:21 PM org.lindenb.tinytools.Lucene4Genes$GeneHandler endElement
INFO: adding document "eukaryotic translation initiation factor 2B, subunit 5 epsilon, 82kDa"
Jul 17, 2009 4:14:21 PM org.lindenb.tinytools.Lucene4Genes$GeneHandler endElement
INFO: adding document "eukaryotic translation initiation factor 2, subunit 1 alpha, 35kDa"
Jul 17, 2009 4:14:21 PM org.lindenb.tinytools.Lucene4Genes$GeneHandler endElement
INFO: adding document "eukaryotic translation initiation factor 2B, subunit 4 delta, 67kDa"
Jul 17, 2009 4:14:21 PM org.lindenb.tinytools.Lucene4Genes$GeneHandler endElement
INFO: adding document "eukaryotic translation initiation factor 5B"
Jul 17, 2009 4:14:21 PM org.lindenb.tinytools.Lucene4Genes$GeneHandler endElement
INFO: adding document "eukaryotic translation initiation factor 4A, isoform 1"
Jul 17, 2009 4:14:21 PM org.lindenb.tinytools.Lucene4Genes$GeneHandler endElement
INFO: adding document "eukaryotic translation initiation factor 2A, 65kDa"
Jul 17, 2009 4:14:21 PM org.lindenb.tinytools.Lucene4Genes$GeneHandler endElement
INFO: adding document "eukaryotic translation initiation factor 4A, isoform 2"
(...)
INFO: adding document "eukaryotic translation initiation factor 2B, subunit 5 epsilon"
Jul 17, 2009 4:14:22 PM org.lindenb.tinytools.Lucene4Genes$GeneHandler endElement
INFO: adding document "mitochondrial translational initiation factor 2"


Querying


Lucene provides a rich query language through the QueryParser.
First an IndexSearcher is created for the current directory.
Directory directory= FSDirectory.getDirectory(this.luceneDir);
IndexSearcher searcher=new IndexSearcher(directory);

The QueryParser translates query expressions into one of Lucene’s built-in query types. By default it will search in the "content" field of each Document.
QueryParser q=new QueryParser("content", new StandardAnalyzer());

The TopDocCollector will contain the five best results:
TopDocCollector hitCollector = new TopDocCollector(5);

We can now parse and execute the query, then loop over the results. Each time a document is found, we print its id, its title and its score.
Query query =q.parse(terms);
searcher.search(
query,
null,//if non-null, used to permit documents to be collected.
hitCollector
);
TopDocs topDocs = hitCollector.topDocs();
if (topDocs!=null && topDocs.totalHits>0)
{
for(ScoreDoc scoredoc:topDocs.scoreDocs)
{
Document document = searcher.doc(scoredoc.doc);
System.out.println(
document.get("id")+"\t"+
document.get("title")+"\t"+
scoredoc.score
);

}
}

Result


Search for alpha
java -cp lucene-core-2.4.1.jar:build org.lindenb.tinytools.Lucene4Genes \
-p query "alpha"
200526 similar to eukaryotic translation initiation factor 2 alpha kinase PEK 0.98296475
201554 similar to eukaryotic translation initiation factor 3, subunit 1 (alpha, 35kD) 0.8425412
340467 similar to Eukaryotic translation initiation factor 3 subunit 1 (eIF-3 alpha) (eIF3 p35) (eIF3j) 0.8425412
203221 similar to eukaryotic translation initiation factor 3, subunit 1 (alpha, 35kD) 0.8425412
82918 similar to eukaryotic translation initiation factor 3, subunit 1 (alpha, 35kD) (H. sapiens) 0.8425412

Search for 1967
java -cp lucene-core-2.4.1.jar:build org.lindenb.tinytools.Lucene4Genes \
-p query "1967"
1967 eukaryotic translation initiation factor 2B, subunit 1 alpha, 26kDa 0.3756647

Search Alpha but NOT subunit
java -cp lucene-core-2.4.1.jar:build org.lindenb.tinytools.Lucene4Genes \
-p query "+alpha -subunit"
200526 similar to eukaryotic translation initiation factor 2 alpha kinase PEK 0.98296475
56478 eukaryotic translation initiation factor 4E nuclear import factor 1 0.24823609
27102 eukaryotic translation initiation factor 2-alpha kinase 1 0.19858888
1983 eukaryotic translation initiation factor 5 0.19858888
5610 eukaryotic translation initiation factor 2-alpha kinase 2 0.17552942

Search for 1967
java -cp lucene-core-2.4.1.jar:build org.lindenb.tinytools.Lucene4Genes \
-p query "1967"
1967 eukaryotic translation initiation factor 2B, subunit 1 alpha, 26kDa 0.3756647

Search for eif4G. The first entry contains the word Rotavirus and we boosted this kind of document, that is why its score is high.
java -cp lucene-core-2.4.1.jar:build org.lindenb.tinytools.Lucene4Genes \
-p query "eif4G"
1981 eukaryotic translation initiation factor 4 gamma, 1 51.464252
1982 eukaryotic translation initiation factor 4 gamma, 2 0.44605052
1973 eukaryotic translation initiation factor 4A, isoform 1 0.42054045
3646 eukaryotic translation initiation factor 3, subunit E 0.26019612
8661 eukaryotic translation initiation factor 3, subunit A 0.22302526

Search for rotavirus AND anyvirus.
java -cp lucene-core-2.4.1.jar:build org.lindenb.tinytools.Lucene4Genes \
-p query "(rotavirus AND anyvirus)"
(empty)

Search for rotavirus OR anyvirus.
java -cp lucene-2.4.1/lucene-core-2.4.1.jar:build org.lindenb.tinytools.Lucene4Genes \
-p query "(rotavirus OR anyvirus)"
1981 eukaryotic translation initiation factor 4 gamma, 1 10.424776

Search for the document having a field id equals to 203221.
java -cp lucene-core-2.4.1.jar:build org.lindenb.tinytools.Lucene4Genes \
-p query "id:203221"
203221 similar to eukaryotic translation initiation factor 3, subunit 1 (alpha, 35kD) 6.0106354

Search for the document having a field id equals to 0.
java -cp lucene-2.4.1/lucene-core-2.4.1.jar:build org.lindenb.tinytools.Lucene4Genes \
-p query "id:00000"
(empty)

Search for the document containing chrom*.
java -cp lucene-2.4.1/lucene-core-2.4.1.jar:build org.lindenb.tinytools.Lucene4Genes \
-p query "chrom*"
83754 eukaryotic translation initiation factor 1A, X chromosome 0.7984222
653994 similar to Eukaryotic translation initiation factor 4H (eIF-4H) (Williams-Beuren syndrome chromosome region 1 protein homolog) 0.5432575
1968 eukaryotic translation initiation factor 2, subunit 3 gamma, 52kDa 0.4981929
3646 eukaryotic translation initiation factor 3, subunit E 0.38104227
54791 argonaute 4 0.30731285


Source code


The source code is also available at Lucene4Genes.java.
package org.lindenb.tinytools;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.logging.Logger;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.lindenb.io.IOUtils;
import org.lindenb.util.Compilation;
import org.lindenb.util.StringUtils;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
* A test for Apache Lucene
*
*/
public class Lucene4Genes
{
private static Logger LOG= Logger.getLogger(Lucene4Genes.class.getName());
private File luceneDir;


/**
* A SAX handler parsing Entrez Gene XML and indexing the textual data.
* <p>
* The text of a fixed set of elements is collected; when an &lt;Entrezgene&gt; element
* ends, one Lucene {@link Document} with the fields "id", "title" and "content" is
* added to the {@link IndexWriter} and the per-gene state is reset.
* @author pierre
*
*/
private static class GeneHandler
    extends DefaultHandler
{
    //text of the element being read, or null when the current element is not collected
    private StringBuilder content=null;
    //entrez gene id
    private String id=null;
    //entrez gene title
    private String title=null;
    //entrez gene concatenated textual data
    private StringBuilder text= new StringBuilder();
    //lucene indexer
    private final IndexWriter indexWriter;

    /**
     * @param indexWriter the writer receiving one document per gene; it is not closed by this handler
     */
    GeneHandler(IndexWriter indexWriter)
    {
        this.indexWriter=indexWriter;
    }

    /**
     * Starts collecting text when the element is one of the indexed ones,
     * otherwise disables collection until the next element starts.
     */
    @Override
    public void startElement(String uri, String localName, String name,
            Attributes attributes) throws SAXException
    {
        this.content=null;
        if(StringUtils.isIn(
            name,
            "Gene-track_geneid",
            "Gene-ref_locus",
            "Gene-ref_desc",
            "Prot-ref_name_E",
            // was a second "Gene-ref_desc": without this entry the "Prot-ref_desc"
            // title fallback tested in endElement could never be reached
            "Prot-ref_desc",
            "Entrezgene_summary",
            "Gene-commentary_text"
            ))
        {
            this.content=new StringBuilder();
        }
    }

    /**
     * On &lt;/Entrezgene&gt;: builds and indexes the document for the current gene.
     * On any collected element: records the id / title when relevant and appends
     * the element text to the gene's content.
     *
     * @throws SAXException if the gene has no id, or wrapping any I/O error raised by the index writer
     */
    @Override
    public void endElement(String uri, String localName, String name)
            throws SAXException
    {
        if(name.equals("Entrezgene"))
        {
            if(this.id==null)
            {
                // a Field cannot hold a null value: fail with an explicit message
                // instead of a NullPointerException raised from the Field constructor
                throw new SAXException("Entrezgene entry without a Gene-track_geneid");
            }
            final String fullText= this.text.toString();
            try {
                LOG.info("adding document \""+title+"\"");
                Document document=new Document();
                document.add(
                    new Field(
                        "id",
                        this.id,
                        Field.Store.YES,//Store the original field value in the index.
                        Field.Index.NOT_ANALYZED //Index the field's value without using an Analyzer, so it can be searched.
                        )
                    );
                document.add(
                    new Field(
                        "title",
                        (this.title==null?this.id:this.title),//fall back to the id when no description was found
                        Field.Store.YES,//Store the original field value in the index.
                        Field.Index.NOT_ANALYZED //Index the field's value without using an Analyzer, so it can be searched.
                        )
                    );
                document.add(
                    new Field(
                        "content",
                        fullText,
                        Field.Store.YES,//Store the original field value in the index.
                        Field.Index.ANALYZED//Index the tokens produced by running the field's value through an Analyzer.
                        )
                    );
                //Sets a boost factor for hits on any field of this document. This value will be multiplied into the score of all hits on this document.
                if(fullText.toLowerCase().contains("rotavirus"))
                {
                    document.setBoost(100f);
                }
                //Adds a document to this index.
                this.indexWriter.addDocument(document);

            } catch (CorruptIndexException e) {
                throw new SAXException(e);
            } catch (IOException e) {
                throw new SAXException(e);
            }
            //reset the state for the next gene
            this.id=null;
            this.title=null;
            this.text= new StringBuilder();
        }
        else if(this.content!=null)
        {
            if(name.equals("Gene-track_geneid"))
            {
                this.id= this.content.toString();
            }
            else if(this.title==null && StringUtils.isIn(name,"Gene-ref_desc","Prot-ref_desc"))
            {
                //the first description met becomes the title
                this.title= this.content.toString();
            }

            this.text.append(this.content.toString()).append(" ");
        }
        this.content=null;
    }

    /** Appends the characters to the current buffer when the enclosing element is collected. */
    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        if(content!=null)
        {
            content.append(ch, start, length);
        }
    }

}

/** Constructor, create the directory if it does not exist */
private Lucene4Genes(File luceneDir)
throws IOException
{
if(!luceneDir.exists())
{
if(!luceneDir.mkdir())
{
throw new IOException("Cannot create "+luceneDir);
}
System.err.println("Created "+luceneDir);
}
if(!luceneDir.isDirectory())
{
throw new IOException("Not a directory "+luceneDir);
}
this.luceneDir=luceneDir;
}

/**
* Index the XML stream containing the entrez genes.
* <p>
* The IndexWriter is opened with <code>create=true</code>, so every call starts a new
* index in {@link #luceneDir} rather than appending to an existing one.
* The writer is always closed, even when parsing fails, so that the index
* write lock is released.
* @param in xml stream
* @throws IOException if the index cannot be written
* @throws SAXException if the XML cannot be parsed or the parser cannot be configured
*/
private void indexGenes(InputStream in) throws IOException,SAXException
{
    LOG.info("indexing genes in "+this.luceneDir);
    SAXParserFactory f= SAXParserFactory.newInstance();
    f.setNamespaceAware(true);
    SAXParser parser= null;
    try {
        parser=f.newSAXParser();
    }
    catch (ParserConfigurationException err)
    {
        throw new SAXException(err);
    }

    /* An Analyzer builds TokenStreams, which analyze text.
     * It thus represents a policy for extracting index terms from text.
     */
    Analyzer analyzer=new StandardAnalyzer();

    /* An IndexWriter creates and maintains an index. */
    IndexWriter indexWriter=new IndexWriter(
        this.luceneDir,//data dir
        analyzer,
        true,//create
        IndexWriter.MaxFieldLength.UNLIMITED //no limit
        );
    try
    {
        parser.parse(in, new GeneHandler(indexWriter));

        /* multiple files for each segment are merged into a single file when a new segment is flushed. */
        indexWriter.setUseCompoundFile(true);
        /* Requests an "optimize" operation on an index, priming the index for the fastest available search. */
        indexWriter.optimize();
    }
    finally
    {
        /* close in all cases: a writer left open keeps the index locked */
        indexWriter.close();
    }
}

/**
* Search our database with the user query, print the result to stdout.
* <p>
* At most five hits are printed, one per line, as
* <code>id&lt;TAB&gt;title&lt;TAB&gt;score</code>. The searcher and the directory
* are closed before returning, whether or not the search succeeded.
* @param terms the query, in the QueryParser syntax; unqualified terms are searched in the "content" field
* @throws IOException if the index cannot be read or if the query cannot be parsed
*/
private void search(String terms) throws IOException
{
    Directory directory= FSDirectory.getDirectory(this.luceneDir);
    try
    {
        IndexSearcher searcher=new IndexSearcher(directory);
        try
        {
            /* QueryParser translates query expressions into one of Lucene's built-in query types */
            QueryParser q=new QueryParser("content", new StandardAnalyzer());
            /* keeps the five best hits */
            TopDocCollector hitCollector = new TopDocCollector(5);
            Query query =q.parse(terms);
            searcher.search(
                query,
                null,//no filter: every matching document may be collected
                hitCollector
                );
            TopDocs topDocs = hitCollector.topDocs();

            if (topDocs!=null && topDocs.totalHits>0)
            {
                for(ScoreDoc scoredoc:topDocs.scoreDocs)
                {
                    Document document = searcher.doc(scoredoc.doc);
                    System.out.println(
                        document.get("id")+"\t"+
                        document.get("title")+"\t"+
                        scoredoc.score
                        );

                }
            }
        }
        catch(ParseException err)
        {
            throw new IOException(err);
        }
        finally
        {
            /* releases the index reader opened by the searcher */
            searcher.close();
        }
    }
    finally
    {
        directory.close();
    }
}


/**
* Command line entry point.
* <pre>
*  -h                      print the usage and exit
*  -d &lt;lucene-directory&gt;   index directory (default: "lucene4genes" under java.io.tmpdir)
*  -p index [files...]     index the EntrezGene XML read from the files, or from stdin when no file is given
*  -p query '&lt;the query&gt;'  run the query and print the best hits
* </pre>
* Any error is reported by printing its stack trace.
* @param args command line arguments
*/
public static void main(String[] args)
{
    Lucene4Genes app=null;
    try
    {
        File dir= new File(System.getProperty("java.io.tmpdir"),"lucene4genes");
        String program=null;
        int optind=0;
        while(optind< args.length)
        {
            if(args[optind].equals("-h"))
            {
                System.err.println("Lucene for genes. Pierre Lindenbaum PhD (2009).");
                System.err.println(Compilation.getLabel());
                System.err.println("options:");
                System.err.println(" -d <lucene-directory> default:"+dir);
                System.err.println(" -p <program>");
                System.err.println(" 'index' <stdin|files> index the EntrezGenes input");
                System.err.println(" 'query' '<the query>'");
                /* asking for help is not a request to run anything */
                return;
            }
            else if(args[optind].equals("-d") || args[optind].equals("-p"))
            {
                final String option= args[optind];
                if(optind+1>=args.length)
                {
                    /* report the missing value instead of failing with an ArrayIndexOutOfBoundsException */
                    System.err.println("Missing argument for option "+option);
                    return;
                }
                final String value= args[++optind];
                if(option.equals("-d"))
                {
                    dir=new File(value);
                }
                else
                {
                    program=value;
                }
            }
            else if(args[optind].equals("--"))
            {
                optind++;
                break;
            }
            else if(args[optind].startsWith("-"))
            {
                System.err.println("Unknown option "+args[optind]);
            }
            else
            {
                break;
            }
            ++optind;
        }
        if(program==null)
        {
            System.err.println("Undefined program");
            return;
        }
        app= new Lucene4Genes(dir);
        if(program.equals("query"))
        {
            /* exactly one argument, the query itself, must remain */
            if(optind+1!=args.length)
            {
                System.err.println("Illegal number of arguments.");
                return;
            }
            String query= args[optind++];
            app.search(query);
        }
        else if(program.equals("index"))
        {

            if(optind==args.length)
            {
                LOG.info("reading stdin");
                app.indexGenes(System.in);
            }
            else
            {
                /* NOTE(review): indexGenes opens its IndexWriter with create=true, so each
                 * file starts a new index; with several files only the last one appears
                 * to remain searchable. Confirm the intent before relying on it. */
                while(optind< args.length)
                {
                    String filename=args[optind++];
                    LOG.info("reading file "+filename);
                    java.io.InputStream r= IOUtils.openInputStream(filename);
                    try
                    {
                        app.indexGenes(r);
                    }
                    finally
                    {
                        /* close the file even when indexing fails */
                        r.close();
                    }
                }
            }
        }
        else
        {
            System.err.println("Unknown program "+program);
            return;
        }
    }
    catch(Throwable err)
    {
        err.printStackTrace();
    }
}
}


That's it !
Pierre