04 May 2007

(Trying to use) The NCBI ASN.1 library

Motivation: I was looking for a way to store and read my linkage data in a binary format, so that reading would be faster and the files smaller.

I had a look at the NCBI C toolbox, which uses ASN.1 as its main format to encode and structure data, much as XML does.

About ASN1: ASN.1 is a standard that describes data structures for representing, encoding, transmitting, and decoding data. It provides a set of formal rules for describing the structure of objects that are independent of machine-specific encoding techniques and is a precise, formal notation that removes ambiguities. Its usage can be compared to the more recent XML Schema (see also my previous post about JAXB).

The NCBI ASN1 C library is described here: http://www.ncbi.nlm.nih.gov/IEB/ToolBox/SDKDOCS/ASNLIB.HTML

Although I still have a problem in the last part, this post can be considered as my first experience with the ASN.1 library.

I started my test by defining a small ASN.1 module to store some PCR primers and their hits on the genome


-- Module PCRMod: PCR primers and the positions where they hit the genome.
PCRMod DEFINITIONS ::=
BEGIN
-- Strand orientation of a hit: forward is 0, reverse is 1.
Orientation ::= ENUMERATED
{
forward(0),
reverse(1)
}

-- One hit of a primer on the genome.
Hit ::= SEQUENCE
{
chromosome VisibleString, -- chromosome
start INTEGER, -- starting position in chromosome 5' + strand
orient Orientation -- orientation
}

-- One primer. Only name and sequence are mandatory; tm and hits are OPTIONAL.
Primer ::= SEQUENCE {
name VisibleString, -- name
tm REAL OPTIONAL, -- melting temperature
sequence VisibleString, -- sequence
hits SET OF Hit OPTIONAL -- hits
}

-- Top-level type of an input file: an ordered list of Primer values.
PrimerInput ::=SEQUENCE OF Primer

END


This schema, saved as 'primer.asn', was then processed with asntool:


# Run asntool twice on the module file primer.asn.
# The first call uses the base name "primer" (-B) and refers to "primerasn" (-K).
asntool -Z T -w 128 -m primer.asn -G T -B primer -K primerasn
# The second call writes the header primerasn.h; -o must be separated from the
# module file name by a space, otherwise "primer.asn-o" is taken as the file name.
asntool -Z T -w 128 -m primer.asn -o primerasn.h


here asntool will read the 'primer.asn' and will generate three source files:

primerasn.h: a C header containing all the states that will be used to parse the ASN1 files.
/***********************************************************************
*
**
* Automatic header module from ASNTOOL
*
************************************************************************/

(...)

/*
 * Everything below is declared `static`, so every source file that includes
 * this header gets its own private copy of these tables.
 */

/* Name of this generated header; main() prints it when the specification cannot be loaded. */
static char * asnfilename = "primerasn.h";
/* Named values of the Orientation enumeration: "forward" = 0 and "reverse" = 1.
 * The first node points to the second one, the second one ends with NULL. */
static AsnValxNode avnx[2] = {
{20,"forward" ,0,0.0,&avnx[1] } ,
{20,"reverse" ,1,0.0,NULL } };

/* Type table of module PCRMod: one entry per named type, per field and per
 * built-in type used by the schema. Entries refer to each other and to avnx by
 * address, so their order must not be changed.
 * NOTE(review): the meaning of the individual numeric columns is defined by the
 * AsnType struct of the NCBI library, which is not shown here. */
static AsnType atx[20] = {
{401, "Orientation" ,1,0,0,0,0,0,0,0,NULL,&atx[1],&avnx[0],0,&atx[2]} ,
{310, "ENUMERATED" ,0,10,0,0,0,0,0,0,NULL,NULL,NULL,0,NULL} ,
{402, "Hit" ,1,0,0,0,0,0,0,0,NULL,&atx[8],&atx[3],0,&atx[9]} ,
{0, "chromosome" ,128,0,0,0,0,0,0,0,NULL,&atx[4],NULL,0,&atx[5]} ,
{323, "VisibleString" ,0,26,0,0,0,0,0,0,NULL,NULL,NULL,0,NULL} ,
{0, "start" ,128,1,0,0,0,0,0,0,NULL,&atx[6],NULL,0,&atx[7]} ,
{302, "INTEGER" ,0,2,0,0,0,0,0,0,NULL,NULL,NULL,0,NULL} ,
{0, "orient" ,128,2,0,0,0,0,0,0,NULL,&atx[0],NULL,0,NULL} ,
{311, "SEQUENCE" ,0,16,0,0,0,0,0,0,NULL,NULL,NULL,0,NULL} ,
{403, "Primer" ,1,0,0,0,0,0,0,0,NULL,&atx[8],&atx[10],0,&atx[17]} ,
{0, "name" ,128,0,0,0,0,0,0,0,NULL,&atx[4],NULL,0,&atx[11]} ,
{0, "tm" ,128,1,0,1,0,0,0,0,NULL,&atx[12],NULL,0,&atx[13]} ,
{309, "REAL" ,0,9,0,0,0,0,0,0,NULL,NULL,NULL,0,NULL} ,
{0, "sequence" ,128,2,0,0,0,0,0,0,NULL,&atx[4],NULL,0,&atx[14]} ,
{0, "hits" ,128,3,0,1,0,0,0,0,NULL,&atx[16],&atx[15],0,NULL} ,
{0, NULL,1,-1,0,0,0,0,0,0,NULL,&atx[2],NULL,0,NULL} ,
{314, "SET OF" ,0,17,0,0,0,0,0,0,NULL,NULL,NULL,0,NULL} ,
{404, "PrimerInput" ,1,0,0,0,0,0,0,0,NULL,&atx[19],&atx[18],0,NULL} ,
{0, NULL,1,-1,0,0,0,0,0,0,NULL,&atx[9],NULL,0,NULL} ,
{312, "SEQUENCE OF" ,0,16,0,0,0,0,0,0,NULL,NULL,NULL,0,NULL} };

/* Module descriptor: module name "PCRMod", the header file name and the first
 * entry of the type table. */
static AsnModule ampx[1] = {
{ "PCRMod" , "primerasn.h",&atx[0],NULL,NULL,0,0} };

/* Pointer aliases to the tables above; the type-pointer #defines index `at`,
 * and main() passes `amp` to AsnReadId. */
static AsnValxNodePtr avn = avnx;
static AsnTypePtr at = atx;
static AsnModulePtr amp = ampx;



/**************************************************
*
* Defines for Module PCRMod
*
**************************************************/

/*
 * Each macro is the address of one entry of the type table reached through
 * `at`: TYPE for the type itself, TYPE_field for one of its fields.
 * The `_E` macros address the two entries whose name is NULL; those entries
 * refer to the Hit and Primer entries, presumably as the element type of the
 * enclosing SET OF / SEQUENCE OF (TODO confirm against the NCBI documentation).
 */
#define ORIENTATION &at[0]

/* Hit and its three fields. */
#define HIT &at[2]
#define HIT_chromosome &at[3]
#define HIT_start &at[5]
#define HIT_orient &at[7]

/* Primer, its four fields, and the unnamed entry attached to `hits`. */
#define PRIMER &at[9]
#define PRIMER_name &at[10]
#define PRIMER_tm &at[11]
#define PRIMER_sequence &at[13]
#define PRIMER_hits &at[14]
#define PRIMER_hits_E &at[15]

/* PrimerInput and its unnamed entry; main() compares against PRIMERINPUT_E. */
#define PRIMERINPUT &at[17]
#define PRIMERINPUT_E &at[18]




primer.h: contains the C headers used to parse the structure declared in the ASN1 schema. There is a method to allocate/free/read and write each structure.
#
(...)
/**************************************************
*
* Generated objects for Module PCRMod
*
**************************************************/

/* Loads the PCRMod specification. main() calls it first and treats a false
 * result as a fatal error. */
NLM_EXTERN Boolean LIBCALL
primerAsnLoad PROTO((void));
/* following #defines are for enumerated type, not used by object loaders */
/* The values match forward(0) and reverse(1) in the ASN.1 module. */
#define Orientation_forward 0
#define Orientation_reverse 1



/**************************************************
* Hit
**************************************************/
/* C counterpart of the ASN.1 type Hit. */
typedef struct struct_Hit {
struct struct_Hit PNTR next; /* pointer to another Hit; Primer.hits points to the first one */
Uint4 OBbits__; /* NOTE(review): presumably presence bits for OPTIONAL fields - confirm */
CharPtr chromosome; /* schema field "chromosome" (VisibleString) */
Int4 start; /* schema field "start" (INTEGER) */
Uint2 orient; /* schema field "orient"; see Orientation_forward / Orientation_reverse */
} Hit, PNTR HitPtr;


/* Free / allocate / read / write functions for Hit; they are implemented in
 * the generated primer.c. */
NLM_EXTERN HitPtr LIBCALL HitFree PROTO ((HitPtr ));
NLM_EXTERN HitPtr LIBCALL HitNew PROTO (( void ));
NLM_EXTERN HitPtr LIBCALL HitAsnRead PROTO (( AsnIoPtr, AsnTypePtr));
NLM_EXTERN Boolean LIBCALL HitAsnWrite PROTO (( HitPtr , AsnIoPtr, AsnTypePtr));



/**************************************************
* Primer
**************************************************/
/* C counterpart of the ASN.1 type Primer. */
typedef struct struct_Primer {
struct struct_Primer PNTR next; /* pointer to another Primer */
Uint4 OBbits__; /* NOTE(review): presumably presence bits for OPTIONAL fields - confirm */
CharPtr name; /* schema field "name" (VisibleString) */
/* NOTE(review): looks like the bit index inside OBbits__ that tells whether the
 * OPTIONAL field tm is set - confirm against the generated primer.c */
#define OB__Primer_tm 0

FloatHi tm; /* schema field "tm" (REAL OPTIONAL) */
CharPtr sequence; /* schema field "sequence" (VisibleString); main() prints it */
struct struct_Hit PNTR hits; /* schema field "hits" (SET OF Hit OPTIONAL) */
} Primer, PNTR PrimerPtr;


/* Free / allocate / read / write functions for Primer. main() uses
 * PrimerAsnRead and PrimerFree. */
NLM_EXTERN PrimerPtr LIBCALL PrimerFree PROTO ((PrimerPtr ));
NLM_EXTERN PrimerPtr LIBCALL PrimerNew PROTO (( void ));
NLM_EXTERN PrimerPtr LIBCALL PrimerAsnRead PROTO (( AsnIoPtr, AsnTypePtr));
NLM_EXTERN Boolean LIBCALL PrimerAsnWrite PROTO (( PrimerPtr , AsnIoPtr, AsnTypePtr));



/**************************************************
* PrimerInput
**************************************************/
/* PrimerInput has no struct of its own: both names are aliases of struct_Primer. */
typedef struct struct_Primer PrimerInput;
typedef struct struct_Primer PNTR PrimerInputPtr;
/* A call written PrimerInputNew() expands to PrimerNew(). */
#define PrimerInputNew() PrimerNew()



/* Free / allocate / read / write functions for PrimerInput.
 * NOTE(review): PrimerInputNew is declared here as a function as well as being
 * the function-like macro above - confirm which one primer.c provides. */
NLM_EXTERN PrimerInputPtr LIBCALL PrimerInputFree PROTO ((PrimerInputPtr ));
NLM_EXTERN PrimerInputPtr LIBCALL PrimerInputNew PROTO (( void ));
NLM_EXTERN PrimerInputPtr LIBCALL PrimerInputAsnRead PROTO (( AsnIoPtr, AsnTypePtr));
NLM_EXTERN Boolean LIBCALL PrimerInputAsnWrite PROTO (( PrimerInputPtr , AsnIoPtr, AsnTypePtr));
(...)


and primer.c the C implementation of those methods.

(...)


OK, here comes the problem: I wrote a simple ASN1 input
PrimerInput::={
{
name "Primer0",
sequence "ATAGCTACTGATGCATGCATCG"
}
}


and I wanted to read each primer. Here is my source:
#include <cerrno>
#include <fstream>
#include <iostream>
#include <string>
#include <stdexcept>
#include <asn.h>
#include <cassert>

/** include the files generated by the ASN1 tool */
#include <primer.h>
#include <primerasn.h>

/**
 * Reads the ASN.1 file named on the command line and prints, on stderr, the
 * sequence of every Primer element it encounters.
 *
 * @param argc argument count; exactly one argument (the input file) is expected
 * @param argv argv[0] is used in messages, argv[1] is the ASN.1 file to read
 * @return EXIT_FAILURE if the specification cannot be loaded, the argument
 *         count is wrong or the file cannot be opened; 0 otherwise
 */
int main(int argc, char** argv)
{
    int optind=1;

    /* init my specification */
    if(!primerAsnLoad())
    {
        fprintf(stderr,"#%s: cannot load ASN1 specification \"%s\".\n" ,
            argv[0],asnfilename );
        return (EXIT_FAILURE);
    }

    /* exactly one argument must remain after the options */
    if(optind+1!=argc)
    {
        fprintf(stderr,"bad input : usage %s ASN1 file\n",argv[0]);
        return(EXIT_FAILURE);
    }

    /** open the input file */
    AsnIoPtr in = AsnIoOpen(argv[1],"r");
    if(in==NULL)
    {
        fprintf(stderr,"Cannot Read %s\n",argv[1]);
        return(EXIT_FAILURE);
    }

    /*
     * NOTE(review): PRIMERINPUT, PRIMERINPUT_E and amp come from primerasn.h,
     * whose tables are `static`; this file therefore works on its own copy of
     * them. If the generated primer.c includes the same header, the type
     * pointers compared here are not the ones used inside PrimerAsnRead.
     * This may be related to the unexpected output - TODO confirm.
     */

    /** init the state of the parser */
    AsnTypePtr asn_type_ptr=PRIMERINPUT;

    /** while we can read a primer... */
    while ((asn_type_ptr = AsnReadId(in, amp, asn_type_ptr)) != NULL)
    {
        if(asn_type_ptr==PRIMERINPUT_E)
        {
            PrimerPtr primer= PrimerAsnRead(in,asn_type_ptr);
            if(primer!=NULL)
            {
                /* Passing a null pointer for %s is undefined behaviour, and
                 * sequence can be NULL here; print the text "(null)"
                 * explicitly so the result is well defined everywhere. */
                const char* sequence=(primer->sequence!=NULL?primer->sequence:"(null)");
                fprintf(stderr,"sequence: %s\n",sequence);
                PrimerFree(primer);
            }
        }
        else
        {
            /* any other identifier: read its value without storing it */
            AsnReadVal(in, asn_type_ptr,NULL);
        }
    }

    AsnIoClose(in);
    return(0);
}


The problem: my input file is processed silently, but the output shows two primers instead of one, and the sequence is reported as NULL.

sequence: (null)
sequence: (null)


I'm blocked here :-)


Pierre

3 comments:

Morgan Langille said...

Interesting, I thought that the ASN1 format was an obscurity that only NCBI used and was slowly dying. Is lack of storage for your data a big problem, or is it more of an optimal computer scientist view that is driving your interest?

Pierre Lindenbaum said...

I think that ASN1 is still the main format used by the NCBI although most of their data can be saved as XML. See the 'formatdb' program used by blast: "Although the FASTA format is most often used as input to formatdb, the use of ASN.1 is advantageous for those who are using ASN.1 as the common source for other formats such as the GenBank report.".

Of course, I know I could save/read my own C struct using fwrite/fread, but my test was an occasion to learn more about the NCBI API. See my previous post about "how blast works ?". Understanding the NCBI ASN.1 API can then be used to parse the output without re-inventing the wheel.

Unknown said...

I haven't used the NCBI library for asn.1, but I have used the asn1c compiler. It works very well and can handle DER, XER, and PER encodings. http://lionet.info/asn1c/ . I would take a look at it as it's fairly simple and robust.