15 September 2010

MongoDB and NCBI pubmed: Inserting, searching and updating. My notebook.

After Neil and Jan, it is now my turn to play with MongoDB.
In this post, I've inserted some NCBI/PUBMED records into Mongo and tested various queries for selecting and updating the entries.

The initial set was a list of ~300 records where Charles Darwin was cited as a "[PersonalNameSubject]": http://www.ncbi.nlm.nih.gov/sites/entrez?db=pubmed&cmd=search&term=%22Darwin+C%22[ps]. This set was saved as XML and transformed to an input for Mongo using the following XSLT stylesheet:(code available here: http://openwetware.org/wiki/Image:Pubmed2mongo.xsl)

<?xml version="1.0" encoding="UTF-8"?>



<xsl:stylesheet xmlns:xsl='http://www.w3.org/1999/XSL/Transform' version="1.0">

<!--

This stylesheet transforms one or more Pubmed
Article in xml format into JSON for mongodb

Author: Pierre Lindenbaum PhD plindenbaum@yahoo.fr

-->

<!-- The result is a plain-text JavaScript file for the mongo shell, not XML. -->
<xsl:output method="text" encoding="UTF-8"/>


<!-- Root template: drop the collection, emit one save() per article, then declare the indexes. -->
<xsl:template match="/">
  <!-- Drop any existing 'articles' collection first. -->
  <xsl:text>&#10;db.articles.drop();&#10;</xsl:text>
  <xsl:apply-templates select="node()"/>
  <!-- The index declarations are written after all the documents. -->
  <xsl:text>&#10;db.articles.ensureIndex({pmid:1}, {unique: true});&#10;</xsl:text>
  <xsl:text>db.articles.ensureIndex({created:1});&#10;</xsl:text>
  <xsl:text>db.articles.ensureIndex({authors:1});&#10;</xsl:text>
  <xsl:text>db.articles.ensureIndex({mesh:1});&#10;</xsl:text>
  <xsl:text>db.articles.ensureIndex({journal:1});&#10;&#10;&#10;</xsl:text>
</xsl:template>

<!-- Document element: hand each PubmedArticle child to its own template, in document order. -->
<xsl:template match="PubmedArticleSet">
  <xsl:for-each select="PubmedArticle">
    <xsl:apply-templates select="."/>
  </xsl:for-each>
</xsl:template>



<!-- One PubmedArticle becomes one "article={...}; article=db.articles.save(article);" statement.
     The PMID is written twice: as the document _id and as the pmid field. -->
<xsl:template match="PubmedArticle">
  <xsl:variable name="pmid" select="MedlineCitation/PMID"/>
  <xsl:variable name="art" select="MedlineCitation/Article"/>
  <xsl:variable name="ids" select="PubmedData/ArticleIdList/ArticleId"/>
  <xsl:text>&#10;article={&#10;</xsl:text>
  <xsl:value-of select="concat('_id:',$pmid,',pmid:',$pmid)"/>
  <!-- Each template below writes its own leading comma, so absent elements add nothing. -->
  <xsl:apply-templates select="MedlineCitation/DateCreated"/>
  <xsl:apply-templates select="$art/ArticleTitle"/>
  <xsl:apply-templates select="$art/Journal/JournalIssue/Issue"/>
  <xsl:apply-templates select="$art/Journal/JournalIssue/Volume"/>
  <xsl:apply-templates select="$art/Pagination/MedlinePgn"/>
  <xsl:apply-templates select="$art/Journal"/>
  <xsl:apply-templates select="$ids[@IdType='doi']"/>
  <xsl:apply-templates select="$ids[@IdType='pmc']"/>
  <xsl:apply-templates select="$art/Language"/>
  <xsl:apply-templates select="$art/AuthorList"/>
  <xsl:apply-templates select="MedlineCitation/MeshHeadingList"/>
  <xsl:text>};&#10;article=db.articles.save(article);&#10;&#10;&#10;&#10;</xsl:text>
</xsl:template>




<!-- DOI identifier, written as a quoted string field. -->
<xsl:template match="ArticleId[@IdType='doi']">
  <xsl:value-of select="',doi:'"/>
  <xsl:apply-templates select="." mode="text"/>
</xsl:template>

<!-- PubMed Central identifier, written as a quoted string field. -->
<xsl:template match="ArticleId[@IdType='pmc']">
  <xsl:value-of select="',pmc:'"/>
  <xsl:apply-templates select="." mode="text"/>
</xsl:template>

<!-- Authors become an array of {firstName,lastName} objects, comma-separated. -->
<xsl:template match="AuthorList">
  <xsl:value-of select="',authors:['"/>
  <xsl:for-each select="Author">
    <xsl:apply-templates select="."/>
    <!-- Separator after every author except the last one. -->
    <xsl:if test="position()!=last()">,</xsl:if>
  </xsl:for-each>
  <xsl:value-of select="']'"/>
</xsl:template>




<!-- MeSH descriptors become an array of quoted strings; qualifiers are not exported. -->
<xsl:template match="MeshHeadingList">
  <xsl:value-of select="',mesh:['"/>
  <xsl:for-each select="MeshHeading/DescriptorName">
    <xsl:apply-templates select="." mode="text"/>
    <!-- Separator after every descriptor except the last one. -->
    <xsl:if test="position()!=last()">,</xsl:if>
  </xsl:for-each>
  <xsl:value-of select="']'"/>
</xsl:template>


<!-- One author object. ForeName and LastName are escaped; a missing element yields an empty string. -->
<xsl:template match="Author">
  <xsl:variable name="first">
    <xsl:call-template name="escape">
      <xsl:with-param name="s" select="ForeName"/>
    </xsl:call-template>
  </xsl:variable>
  <xsl:variable name="last">
    <xsl:call-template name="escape">
      <xsl:with-param name="s" select="LastName"/>
    </xsl:call-template>
  </xsl:variable>
  <xsl:value-of select="concat('{firstName:&quot;',$first,'&quot;,lastName:&quot;',$last,'&quot;}')"/>
</xsl:template>




<!-- Creation date as a sub-document of numbers; number() drops leading zeros such as "08". -->
<xsl:template match="DateCreated">
  <xsl:value-of select="concat(',created:{year:',number(Year),',month:',number(Month),',day:',number(Day),'}')"/>
</xsl:template>


<!-- Article title, written as a quoted string field. -->
<xsl:template match="ArticleTitle">
  <xsl:value-of select="',title:'"/>
  <xsl:apply-templates select="." mode="text"/>
</xsl:template>


<!-- Journal issue, kept as a string (values such as "50-51" are not numeric). -->
<xsl:template match="Issue">
  <xsl:value-of select="',issue:'"/>
  <xsl:apply-templates select="." mode="text"/>
</xsl:template>

<!-- Journal volume, written as a quoted string field. -->
<xsl:template match="Volume">
  <xsl:value-of select="',volume:'"/>
  <xsl:apply-templates select="." mode="text"/>
</xsl:template>

<!-- Pagination exactly as given by Medline (e.g. "363-99"), written as a quoted string field. -->
<xsl:template match="MedlinePgn">
  <xsl:value-of select="',pgn:'"/>
  <xsl:apply-templates select="." mode="text"/>
</xsl:template>



<!-- Journal sub-document: title, then the optional ISO abbreviation and print ISSN.
     The title key is always written, so a value must always follow it: when the
     Title element is absent, null is emitted instead of leaving "title:," behind,
     which the mongo shell would reject as a syntax error. -->
<xsl:template match="Journal">
  <xsl:text>,journal:{title:</xsl:text>
  <xsl:choose>
    <xsl:when test="Title">
      <xsl:apply-templates select="Title" mode="text"/>
    </xsl:when>
    <xsl:otherwise>null</xsl:otherwise>
  </xsl:choose>
  <xsl:apply-templates select="ISOAbbreviation"/>
  <!-- Only the print ISSN is exported; electronic ISSNs are ignored. -->
  <xsl:apply-templates select="ISSN[@IssnType='Print']"/>
  <xsl:text>}</xsl:text>
</xsl:template>

<!-- ISO journal abbreviation, written as a quoted string field inside the journal sub-document. -->
<xsl:template match="ISOAbbreviation">
  <xsl:value-of select="',abbr:'"/>
  <xsl:apply-templates select="." mode="text"/>
</xsl:template>

<!-- ISSN, written as a quoted string field inside the journal sub-document. -->
<xsl:template match="ISSN">
  <xsl:value-of select="',issn:'"/>
  <xsl:apply-templates select="." mode="text"/>
</xsl:template>


<!-- Language code of the article (e.g. "eng"), written as a quoted string field. -->
<xsl:template match="Language">
  <xsl:value-of select="',lang:'"/>
  <xsl:apply-templates select="." mode="text"/>
</xsl:template>

<!-- Generic "text" mode: write the string value of any element as a double-quoted,
     escaped JavaScript string literal. -->
<xsl:template match="*" mode="text">
  <xsl:variable name="escaped">
    <xsl:call-template name="escape">
      <xsl:with-param name="s" select="."/>
    </xsl:call-template>
  </xsl:variable>
  <xsl:value-of select="concat('&quot;',$escaped,'&quot;')"/>
</xsl:template>

<!-- escape: make a string safe to place between double quotes in the generated JavaScript.
     Parameter s: the text to escape (a node-set is converted to the string value of its first node).
     Output: s with line breaks replaced by spaces, each backslash doubled and each double quote
     prefixed by a backslash.
     The original version escaped only double quotes, so a backslash in the source text was read
     by the shell as the start of an escape sequence, and a raw line break ended the literal. -->
<xsl:template name="escape">
  <xsl:param name="s"/>
  <xsl:call-template name="escape-backslashes">
    <!-- A JavaScript string literal cannot contain an unescaped LF or CR. -->
    <xsl:with-param name="s" select="translate($s,'&#10;&#13;','  ')"/>
  </xsl:call-template>
</xsl:template>

<!-- Helper for "escape": doubles every backslash and passes the text between backslashes
     to escape-quotes. Backslashes are handled first so that the ones added in front of
     quotes are not doubled again. -->
<xsl:template name="escape-backslashes">
  <xsl:param name="s"/>
  <xsl:choose>
    <xsl:when test="contains($s,'\')">
      <xsl:call-template name="escape-quotes">
        <xsl:with-param name="s" select="substring-before($s,'\')"/>
      </xsl:call-template>
      <xsl:text>\\</xsl:text>
      <xsl:call-template name="escape-backslashes">
        <xsl:with-param name="s" select="substring-after($s,'\')"/>
      </xsl:call-template>
    </xsl:when>
    <xsl:otherwise>
      <xsl:call-template name="escape-quotes">
        <xsl:with-param name="s" select="$s"/>
      </xsl:call-template>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>

<!-- Helper for "escape": writes s with a backslash in front of every double quote. -->
<xsl:template name="escape-quotes">
  <xsl:param name="s"/>
  <xsl:choose>
    <xsl:when test="contains($s,'&quot;')">
      <xsl:value-of select="substring-before($s,'&quot;')"/>
      <xsl:text>\&quot;</xsl:text>
      <xsl:call-template name="escape-quotes">
        <xsl:with-param name="s" select="substring-after($s,'&quot;')"/>
      </xsl:call-template>
    </xsl:when>
    <xsl:otherwise>
      <xsl:value-of select="$s"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>




</xsl:stylesheet>


Transform pubmed/xml to mongo:
xsltproc pubmed2mongo.xsl ~/pubmed_result.txt > input.js

> more input.js

db.articles.drop();

article={_id:20665232,pmid:20665232,created:{year:2010,month:8,day:10},title:"Charles Darwin's beagle voyage, fossil vertebrate succession, and \"the gradual birth & death of species\".",issue:"2",volume:"43",pgn:"363-99",journal:{title:"Journal of the history of biology",abbr:"J Hist Biol",issn:"0022-5010"},doi:"10.1007/s10739-009-9189-9",lang:"eng",authors:[{firstName:"Paul D",lastName:"Brinkman"}],mesh:["Animals","Fossils","History, 19th Century","Natural History","Phylogeny","Vertebrates"]};
article=db.articles.save(article);




article={_id:20626121,pmid:20626121,created:{year:2010,month:7,day:14},title:"[The biomedical legacy of Charles Darwin]",issue:"2",volume:"146",pgn:"87-9",journal:{title:"Gaceta médica de México",abbr:"Gac Med Mex",issn:"0016-3813"},lang:"spa",authors:[{firstName:"Emilio",lastName:"García-Procel"}],mesh:["Biology","Evolution","History, 19th Century","History, 20th Century","Medicine"]};
article=db.articles.save(article);




article={_id:20503821,pmid:20503821,created:{year:2010,month:5,day:27},title:"Darwin and the popularization of evolution.",issue:"1",volume:"64",pgn:"5-24",journal:{title:"Notes and records of the Royal Society of London",abbr:"Notes Rec R Soc Lond",issn:"0035-9149"},lang:"eng",authors:[{firstName:"Bernard",lastName:"Lightman"}],mesh:["Biology","Evolution","Genetic Fitness","History, 19th Century","History, 20th Century","Humans","Male","Philosophy","Religion","Science","Selection, Genetic","United States"]};
article=db.articles.save(article);

and load this file into mongo in the database 'pubmed'.
mongo pubmed input.js


and add a few indexes for this database:
>db.articles.ensureIndex({pmid:1}, {unique: true});
db.articles.ensureIndex({created:1});
db.articles.ensureIndex({authors:1});
db.articles.ensureIndex({mesh:1});
db.articles.ensureIndex({journal:1});


print 3 records

> db.articles.find().limit(3).forEach(printjson);
{
"_id" : 20665232,
"pmid" : 20665232,
"created" : {
"year" : 2010,
"month" : 8,
"day" : 10
},
"title" : "Charles Darwin's beagle voyage, fossil vertebrate succession, and \"the gradual birth & death of species\".",
"issue" : "2",
"volume" : "43",
"pgn" : "363-99",
"journal" : {
"title" : "Journal of the history of biology",
"abbr" : "J Hist Biol",
"issn" : "0022-5010"
},
"doi" : "10.1007/s10739-009-9189-9",
"lang" : "eng",
"authors" : [
{
"firstName" : "Paul D",
"lastName" : "Brinkman"
}
],
"mesh" : [
"Animals",
"Fossils",
"History, 19th Century",
"Natural History",
"Phylogeny",
"Vertebrates"
]
}
{
"_id" : 20626121,
"pmid" : 20626121,
"created" : {
"year" : 2010,
"month" : 7,
"day" : 14
},
"title" : "[The biomedical legacy of Charles Darwin]",
"issue" : "2",
"volume" : "146",
"pgn" : "87-9",
"journal" : {
"title" : "Gaceta médica de México",
"abbr" : "Gac Med Mex",
"issn" : "0016-3813"
},
"lang" : "spa",
"authors" : [
{
"firstName" : "Emilio",
"lastName" : "García-Procel"
}
],
"mesh" : [
"Biology",
"Evolution",
"History, 19th Century",
"History, 20th Century",
"Medicine"
]
}
{
"_id" : 20503821,
"pmid" : 20503821,
"created" : {
"year" : 2010,
"month" : 5,
"day" : 27
},
"title" : "Darwin and the popularization of evolution.",
"issue" : "1",
"volume" : "64",
"pgn" : "5-24",
"journal" : {
"title" : "Notes and records of the Royal Society of London",
"abbr" : "Notes Rec R Soc Lond",
"issn" : "0035-9149"
},
"lang" : "eng",
"authors" : [
{
"firstName" : "Bernard",
"lastName" : "Lightman"
}
],
"mesh" : [
"Biology",
"Evolution",
"Genetic Fitness",
"History, 19th Century",
"History, 20th Century",
"Humans",
"Male",
"Philosophy",
"Religion",
"Science",
"Selection, Genetic",
"United States"
]
}

skip 2 and print 1 record

> db.articles.find().skip(2).limit(1).forEach(printjson);
{
"_id" : 20503821,
"pmid" : 20503821,
"created" : {
"year" : 2010,
"month" : 5,
"day" : 27
},
"title" : "Darwin and the popularization of evolution.",
"issue" : "1",
"volume" : "64",
"pgn" : "5-24",
"journal" : {
"title" : "Notes and records of the Royal Society of London",
"abbr" : "Notes Rec R Soc Lond",
"issn" : "0035-9149"
},
"lang" : "eng",
"authors" : [
{
"firstName" : "Bernard",
"lastName" : "Lightman"
}
],
"mesh" : [
"Biology",
"Evolution",
"Genetic Fitness",
"History, 19th Century",
"History, 20th Century",
"Humans",
"Male",
"Philosophy",
"Religion",
"Science",
"Selection, Genetic",
"United States"
]
}

How many records in the database, regardless of the limit ?

> db.articles.find().limit(20).count();
327

How many records will be printed ?

> db.articles.find().limit(20).size();
20

Find one record with PMID=-1

> db.articles.findOne({pmid:-1});
null

Find PMID 20180452

> db.articles.findOne({pmid:20180452});
{
"_id" : 20180452,
"pmid" : 20180452,
"created" : {
"year" : 2010,
"month" : 2,
"day" : 25
},
"title" : "[Darwin's hidden feeling for emotions of the species]",
"issue" : "50-51",
"volume" : "106",
"pgn" : "3443-6",
"journal" : {
"title" : "Läkartidningen",
"abbr" : "Lakartidningen",
"issn" : "0023-7205"
},
"lang" : "swe",
"authors" : [
{
"firstName" : "Gösta",
"lastName" : "Alfvén"
}
],
"mesh" : [
"Animals",
"Emotions",
"England",
"Evolution",
"Famous Persons",
"History, 19th Century",
"Humans",
"Species Specificity"
]
}

Searching in arrays: find the records having a MeSH term equal to 'Lactose Intolerance'

> db.articles.find({mesh:'Lactose Intolerance'}).forEach(printjson);
{
"_id" : 17575947,
"pmid" : 17575947,
"created" : {
"year" : 2007,
"month" : 6,
"day" : 19
},
"title" : "Darwin's illness: a final diagnosis.",
"issue" : "1",
"volume" : "61",
"pgn" : "23-9",
"journal" : {
"title" : "Notes and records of the Royal Society of London",
"abbr" : "Notes Rec R Soc Lond",
"issn" : "0035-9149"
},
"lang" : "eng",
"authors" : [
{
"firstName" : "Fernando",
"lastName" : "Orrego"
},
{
"firstName" : "Carlos",
"lastName" : "Quintana"
}
],
"mesh" : [
"Biology",
"Crohn Disease",
"England",
"History, 19th Century",
"Humans",
"Lactose Intolerance",
"Male"
]
}
{
"_id" : 15811889,
"pmid" : 15811889,
"created" : {
"year" : 2005,
"month" : 4,
"day" : 6
},
"title" : "Darwin's illness revealed.",
"issue" : "954",
"volume" : "81",
"pgn" : "248-51",
"journal" : {
"title" : "Postgraduate medical journal",
"abbr" : "Postgrad Med J",
"issn" : "0032-5473"
},
"doi" : "10.1136/pgmj.2004.025569",
"pmc" : "PMC1743237",
"lang" : "eng",
"authors" : [
{
"firstName" : "Anthony K",
"lastName" : "Campbell"
},
{
"firstName" : "Stephanie B",
"lastName" : "Matthews"
}
],
"mesh" : [
"Famous Persons",
"Great Britain",
"History, 19th Century",
"Lactose Intolerance"
]
}

Explain, find with an index (PMID)

> db.articles.find({pmid:20180452}).explain();
{
"cursor" : "BtreeCursor pmid_1",
"nscanned" : 1,
"nscannedObjects" : 1,
"n" : 1,
"millis" : 0,
"indexBounds" : {
"pmid" : [
[
20180452,
20180452
]
]
}
}

Explain, find without an index (volume)

> db.articles.find({volume:'81'}).explain();
{
"cursor" : "BasicCursor",
"nscanned" : 327,
"nscannedObjects" : 327,
"n" : 2,
"millis" : 0,
"indexBounds" : {

}
}

'AND operator': search for the records whose MeSH terms include 'Evolution' AND 'Religion' AND 'History, 19th Century'

>db.articles.find({mesh:{$all:["Evolution","History, 19th Century","Religion"]}}).limit(2).forEach(printjson);

{
"_id" : 20503821,
"pmid" : 20503821,
"created" : {
"year" : 2010,
"month" : 5,
"day" : 27
},
"title" : "Darwin and the popularization of evolution.",
"issue" : "1",
"volume" : "64",
"pgn" : "5-24",
"journal" : {
"title" : "Notes and records of the Royal Society of London",
"abbr" : "Notes Rec R Soc Lond",
"issn" : "0035-9149"
},
"lang" : "eng",
"authors" : [
{
"firstName" : "Bernard",
"lastName" : "Lightman"
}
],
"mesh" : [
"Biology",
"Evolution",
"Genetic Fitness",
"History, 19th Century",
"History, 20th Century",
"Humans",
"Male",
"Philosophy",
"Religion",
"Science",
"Selection, Genetic",
"United States"
]
}
{
"_id" : 20145235,
"pmid" : 20145235,
"created" : {
"year" : 2010,
"month" : 2,
"day" : 10
},
"title" : "Darwin's compassionate view of human nature.",
"issue" : "6",
"volume" : "303",
"pgn" : "557-8",
"journal" : {
"title" : "JAMA : the journal of the American Medical Association",
"abbr" : "JAMA"
},
"doi" : "10.1001/jama.2010.101",
"lang" : "eng",
"authors" : [
{
"firstName" : "Paul",
"lastName" : "Ekman"
}
],
"mesh" : [
"Altruism",
"Animals",
"Behavior, Animal",
"Empathy",
"Evolution",
"Famous Persons",
"History, 19th Century",
"Humans",
"Religion",
"Social Values"
]
}

Search all, only return the title and the pmid, limit 5

> db.articles.find({},{"title":1,"pmid":1}).limit(5).forEach(printjson);
{
"_id" : 20665232,
"pmid" : 20665232,
"title" : "Charles Darwin's beagle voyage, fossil vertebrate succession, and \"the gradual birth & death of species\"."
}
{
"_id" : 20626121,
"pmid" : 20626121,
"title" : "[The biomedical legacy of Charles Darwin]"
}
{
"_id" : 20503821,
"pmid" : 20503821,
"title" : "Darwin and the popularization of evolution."
}
{
"_id" : 20481191,
"pmid" : 20481191,
"title" : "Between the Beagle and the barnacle: Darwin's microscopy, 1837-1854."
}
{
"_id" : 20338536,
"pmid" : 20338536,
"title" : "Darwin as a student of behavior."
}

Search all, only return the title and the date, sort by year, limit 5

> db.articles.find({},{"title":1,"created":1}).limit(5).sort({"created.year":1}).forEach(printjson);

{
"_id" : 20255988,
"created" : {
"year" : 1947,
"month" : 12,
"day" : 1
},
"title" : "Charles Darwin's life at Downe."
}
{
"_id" : 14840951,
"created" : {
"year" : 1951,
"month" : 12,
"day" : 1
},
"title" : "Some letters from Charles Darwin to Jeffries Wyman."
}
{
"_id" : 13110175,
"created" : {
"year" : 1954,
"month" : 12,
"day" : 1
},
"title" : "The life of the shawl."
}
{
"_id" : 13457284,
"created" : {
"year" : 1957,
"month" : 12,
"day" : 1
},
"title" : "[Charles Darwin; 1809-1882.]"
}
{
"_id" : 13377935,
"created" : {
"year" : 1957,
"month" : 12,
"day" : 1
},
"title" : "[Darwin and Freud; on Sigmund Freud's centenary.]"
}

Search all, omit some fields, limit 2

> db.articles.find({},{title:0,pmid:0,authors:0,created:0,mesh:0,journal:0}).limit(2).forEach(printjson);
{
"_id" : 20665232,
"issue" : "2",
"volume" : "43",
"pgn" : "363-99",
"doi" : "10.1007/s10739-009-9189-9",
"lang" : "eng"
}
{
"_id" : 20626121,
"issue" : "2",
"volume" : "146",
"pgn" : "87-9",
"lang" : "spa"
}

Sub documents: Search articles published in the 'Lancet',limit 2

> db.articles.find({"journal.title":"Lancet"}).limit(2).forEach(printjson);
{
"_id" : 19205083,
"pmid" : 19205083,
"created" : {
"year" : 2009,
"month" : 2,
"day" : 9
},
"title" : "What Darwin learned in medical school.",
"issue" : "9662",
"volume" : "373",
"pgn" : "454-5",
"journal" : {
"title" : "Lancet",
"abbr" : "Lancet"
},
"lang" : "eng",
"authors" : [
{
"firstName" : "Niles",
"lastName" : "Eldredge"
}
],
"mesh" : [
"Education, Medical",
"Educational Status",
"Evolution",
"History, 19th Century",
"Natural History",
"Schools, Medical",
"Scotland"
]
}
{
"_id" : 11597663,
"pmid" : 11597663,
"created" : {
"year" : 2001,
"month" : 10,
"day" : 12
},
"title" : "Darwin the philosopher?",
"issue" : "9288",
"volume" : "358",
"pgn" : "1118",
"journal" : {
"title" : "Lancet",
"abbr" : "Lancet",
"issn" : "0140-6736"
},
"lang" : "eng",
"authors" : [
{
"firstName" : "J",
"lastName" : "Radcliffe Richards"
}
],
"mesh" : [
"History, 19th Century",
"Philosophy"
]
}

Only print the first author of each article

> db.articles.find({},{"authors":{$slice:1}}).limit(3).forEach(printjson);
{
"_id" : 20665232,
"pmid" : 20665232,
"created" : {
"year" : 2010,
"month" : 8,
"day" : 10
},
"title" : "Charles Darwin's beagle voyage, fossil vertebrate succession, and \"the gradual birth & death of species\".",
"issue" : "2",
"volume" : "43",
"pgn" : "363-99",
"journal" : {
"title" : "Journal of the history of biology",
"abbr" : "J Hist Biol",
"issn" : "0022-5010"
},
"doi" : "10.1007/s10739-009-9189-9",
"lang" : "eng",
"authors" : [
{
"firstName" : "Paul D",
"lastName" : "Brinkman"
}
],
"mesh" : [
"Animals",
"Fossils",
"History, 19th Century",
"Natural History",
"Phylogeny",
"Vertebrates"
]
}
{
"_id" : 20626121,
"pmid" : 20626121,
"created" : {
"year" : 2010,
"month" : 7,
"day" : 14
},
"title" : "[The biomedical legacy of Charles Darwin]",
"issue" : "2",
"volume" : "146",
"pgn" : "87-9",
"journal" : {
"title" : "Gaceta médica de México",
"abbr" : "Gac Med Mex",
"issn" : "0016-3813"
},
"lang" : "spa",
"authors" : [
{
"firstName" : "Emilio",
"lastName" : "García-Procel"
}
],
"mesh" : [
"Biology",
"Evolution",
"History, 19th Century",
"History, 20th Century",
"Medicine"
]
}
{
"_id" : 20503821,
"pmid" : 20503821,
"created" : {
"year" : 2010,
"month" : 5,
"day" : 27
},
"title" : "Darwin and the popularization of evolution.",
"issue" : "1",
"volume" : "64",
"pgn" : "5-24",
"journal" : {
"title" : "Notes and records of the Royal Society of London",
"abbr" : "Notes Rec R Soc Lond",
"issn" : "0035-9149"
},
"lang" : "eng",
"authors" : [
{
"firstName" : "Bernard",
"lastName" : "Lightman"
}
],
"mesh" : [
"Biology",
"Evolution",
"Genetic Fitness",
"History, 19th Century",
"History, 20th Century",
"Humans",
"Male",
"Philosophy",
"Religion",
"Science",
"Selection, Genetic",
"United States"
]
}

Only print the last author of each article

> db.articles.find({},{"authors":{$slice:-1}}).limit(3).forEach(printjson);
{
"_id" : 20665232,
"pmid" : 20665232,
"created" : {
"year" : 2010,
"month" : 8,
"day" : 10
},
"title" : "Charles Darwin's beagle voyage, fossil vertebrate succession, and \"the gradual birth & death of species\".",
"issue" : "2",
"volume" : "43",
"pgn" : "363-99",
"journal" : {
"title" : "Journal of the history of biology",
"abbr" : "J Hist Biol",
"issn" : "0022-5010"
},
"doi" : "10.1007/s10739-009-9189-9",
"lang" : "eng",
"authors" : [
{
"firstName" : "Paul D",
"lastName" : "Brinkman"
}
],
"mesh" : [
"Animals",
"Fossils",
"History, 19th Century",
"Natural History",
"Phylogeny",
"Vertebrates"
]
}
{
"_id" : 20626121,
"pmid" : 20626121,
"created" : {
"year" : 2010,
"month" : 7,
"day" : 14
},
"title" : "[The biomedical legacy of Charles Darwin]",
"issue" : "2",
"volume" : "146",
"pgn" : "87-9",
"journal" : {
"title" : "Gaceta médica de México",
"abbr" : "Gac Med Mex",
"issn" : "0016-3813"
},
"lang" : "spa",
"authors" : [
{
"firstName" : "Emilio",
"lastName" : "García-Procel"
}
],
"mesh" : [
"Biology",
"Evolution",
"History, 19th Century",
"History, 20th Century",
"Medicine"
]
}
{
"_id" : 20503821,
"pmid" : 20503821,
"created" : {
"year" : 2010,
"month" : 5,
"day" : 27
},
"title" : "Darwin and the popularization of evolution.",
"issue" : "1",
"volume" : "64",
"pgn" : "5-24",
"journal" : {
"title" : "Notes and records of the Royal Society of London",
"abbr" : "Notes Rec R Soc Lond",
"issn" : "0035-9149"
},
"lang" : "eng",
"authors" : [
{
"firstName" : "Bernard",
"lastName" : "Lightman"
}
],
"mesh" : [
"Biology",
"Evolution",
"Genetic Fitness",
"History, 19th Century",
"History, 20th Century",
"Humans",
"Male",
"Philosophy",
"Religion",
"Science",
"Selection, Genetic",
"United States"
]
}

comparators: Print the title and the date for the articles published between February and March 2009.

> db.articles.find({"created.year":2009,"created.month":{$lt:4,$gt:1} },{title:1,created:1}).limit(2).forEach(printjson);

{
"_id" : 19283711,
"created" : {
"year" : 2009,
"month" : 3,
"day" : 17
},
"title" : "The day of immunology 2009."
}
{
"_id" : 19258529,
"created" : {
"year" : 2009,
"month" : 3,
"day" : 4
},
"title" : "MMBR to highlight microbial evolution, diversity, and ecology in 2009."
}

Print the title and the MeSH terms for the articles having neither 'Evolution' nor 'Animals' in the mesh list.

> db.articles.find({mesh:{$nin:["Evolution","Animals"]} },{title:1,mesh:1}).limit(2).forEach(printjson);

{
"_id" : 20338529,
"title" : "The Darwin of pangenesis.",
"mesh" : [
"Cell Biology",
"History, 19th Century",
"Natural History",
"Physiology",
"Reproduction",
"Selection, Genetic"
]
}
{
"_id" : 20338527,
"title" : "Cross- and self-fertilization of plants.",
"mesh" : [
"Bibliography as Topic",
"Botany",
"Correspondence as Topic",
"Crosses, Genetic",
"Fertilization",
"Flowers",
"History, 19th Century",
"Orchidaceae",
"Plant Physiological Phenomena",
"Pollination"
]
}

Print the title and the authors for the articles having two authors, limit 2.

>db.articles.find({authors:{$size:2}} ,{title:1,authors:1}).limit(2).forEach(printjson);

{
"_id" : 20338526,
"title" : "Minute observations and theoretical framework of Darwin's studies on climbing plants.",
"authors" : [
{
"firstName" : "Jean-Marc",
"lastName" : "Drouin"
},
{
"firstName" : "Thierry",
"lastName" : "Deroin"
}
]
}
{
"_id" : 20338522,
"title" : "A non-Darwinian Darwin: An introduction.",
"authors" : [
{
"firstName" : "Jean",
"lastName" : "Gayon"
},
{
"firstName" : "Michel",
"lastName" : "Veuille"
}
]
}

Search for articles having a pmc-id.

> db.articles.find({pmc:{$exists:true}} ,{title:1,pmc:1}).limit(2).forEach(printjson);
{
"_id" : 19884139,
"title" : "Darwin's contributions to our understanding of emotional expressions.",
"pmc" : "PMC2781895"
}
{
"_id" : 19258529,
"title" : "MMBR to highlight microbial evolution, diversity, and ecology in 2009.",
"pmc" : "PMC2650884"
}

Search for articles lacking a pmc-id.

>db.articles.find({pmc:{$exists:false}} ,{title:1,pmc:1}).limit(2).forEach(printjson);
{
"_id" : 20665232,
"title" : "Charles Darwin's beagle voyage, fossil vertebrate succession, and \"the gradual birth & death of species\"."
}
{ "_id" : 20626121, "title" : "[The biomedical legacy of Charles Darwin]" }

Number of articles where the title is not a string


> db.articles.find({title:{$not:{$type:2}}}).count();

0

OR operator:Articles published in 'Nature' or 'Lancet',limit 3

> db.articles.find({$or:[{"journal.title":"Lancet"},{"journal.title":"Nature"}]},{title:1,"journal.title":1}).limit(3).forEach(printjson);

{
"_id" : 19242459,
"title" : "Q&A: Getting under Darwin's skin. Interview by Adam Rutherford.",
"journal" : {
"title" : "Nature"
}
}
{
"_id" : 19205083,
"title" : "What Darwin learned in medical school.",
"journal" : {
"title" : "Lancet"
}
}
{
"_id" : 19020602,
"title" : "Birthdays to remember.",
"journal" : {
"title" : "Nature"
}
}

AND operator: number of articles published in both 'Nature' AND 'Lancet' (none, as expected)

> db.articles.find({$and:[{"journal.title":"Lancet"},{"journal.title":"Nature"}]}).count();
0

REGEX operator: search for articles whose title contains 'darwin' followed by at least one more letter (case-insensitive), limit 5

> db.articles.find({title:/darwin[a-z]+/i },{title:1}).limit(5).forEach(printjson);

{
"_id" : 20338530,
"title" : "Sexual selection: Another Darwinian process."
}
{ "_id" : 20338522, "title" : "A non-Darwinian Darwin: An introduction." }
{
"_id" : 19784612,
"title" : "The predictability of evolution: glimpses into a post-Darwinian world."
}
{
"_id" : 19213802,
"title" : "Darwinian evolution in the light of genomics."
}
{
"_id" : 19203139,
"title" : "Darwinian theory, functionalism, and the first American psychological revolution."
}

Using javascript $where: articles whose title starts with 'DARWIN'

> db.articles.find({$where:"this.title.substr(0,6)==\"DARWIN\""},{title:1}).limit(5).forEach(printjson);

{
"_id" : 14341734,
"title" : "DARWIN AS THE SOURCE OF FREUD'S NEO-LAMARCKIANISM."
}
{ "_id" : 14275525, "title" : "DARWIN'S ILLNESS." }
{
"_id" : 14248443,
"title" : "DARWIN'S HEALTH IN RELATION TO HIS VOYAGE TO SOUTH AMERICA."
}
{ "_id" : 14217140, "title" : "DARWIN'S ILLNESS." }

Array element by position: First author is Darwin

> db.articles.find({"authors.0.lastName":"Darwin"},{authors:1,title:1}).forEach(printjson);

{
"_id" : 11640659,
"title" : "[Not Available]",
"authors" : [
{
"firstName" : "C",
"lastName" : "Darwin"
}
]
}

Min/Max operator with indexed properties. Print the PMIDs in the range [382974,797714[ (min is inclusive, max is exclusive)


>db.articles.find({},{pmid:1}).max({pmid:797714}).min({pmid:382974}).forEach(printjson);


{ "_id" : 382974, "pmid" : 382974 }
{ "_id" : 385078, "pmid" : 385078 }
{ "_id" : 387661, "pmid" : 387661 }
{ "_id" : 392346, "pmid" : 392346 }
{ "_id" : 395189, "pmid" : 395189 }
{ "_id" : 797709, "pmid" : 797709 }
{ "_id" : 797713, "pmid" : 797713 }

Server-side javascript (db.eval): build a collection containing all the distinct authors' last names

> db.eval( function()
{
db.articles.find().forEach(function(o)
{
for(i in o.authors)
{
var s= o.authors[i].lastName;
db.names.save({name:s, _id:s});
}
}
);});
> db.names.find().limit(10).forEach(printjson);



{ "_id" : "Hodge", "name" : "Hodge" }
{ "_id" : "Deutsch", "name" : "Deutsch" }
{ "_id" : "Bellini", "name" : "Bellini" }
{ "_id" : "Torgerson", "name" : "Torgerson" }
{ "_id" : "Brinkman", "name" : "Brinkman" }
{ "_id" : "Alfvén", "name" : "Alfvén" }
{ "_id" : "Derry", "name" : "Derry" }
{ "_id" : "Fara", "name" : "Fara" }
{ "_id" : "Eldredge", "name" : "Eldredge" }
{ "_id" : "Buss", "name" : "Buss" }

Distinct operator: get all the distinct publication years

> db.articles.distinct("created.year")

[
1947,
1951,
1954,
1957,
1959,
1960,
1963,
1965,
1967,
1969,
1970,
1971,
1972,
1973,
1974,
1975,
1977,
1978,
1979,
1980,
1981,
1982,
1983,
1984,
1985,
1986,
1987,
1989,
1990,
1991,
1992,
1993,
1994,
1995,
1996,
1997,
1998,
1999,
2000,
2001,
2002,
2003,
2004,
2005,
2006,
2007,
2008,
2009,
2010
]

Distinct operator: get all the distinct publication years for the Journal 'Nature'

> db.articles.distinct("created.year",{"journal.title":"Nature"})

[ 1969, 1982, 1983, 2001, 2004, 2005, 2007, 2008, 2009 ]

GROUP operator: the number of articles per journal having mesh='Evolution' and having a number of articles greater than 2

> db.articles.group(
{
key:{},
cond:{mesh:"Evolution"},
initial:{journal:{},total:0},
reduce: function(object, aggregate)
{
var count=aggregate.journal[object.journal.title];
if(!count)
{
count=0;
}
count++;
aggregate.journal[object.journal.title]=count;
aggregate.total++;
},
finalize:function(aggregate)
{
for(j in aggregate.journal)
{
if( aggregate.journal[j]<3)
{
delete aggregate.journal[j];
}
}
}
})



[
{
"journal" : {
"Comptes rendus biologies" : 5,
"Läkartidningen" : 6,
"Isis; an international review devoted to the history of science and its cultural influences" : 6,
"Tidsskrift for den Norske lægeforening : tidsskrift for praktisk medicin, ny række" : 3,
"Endeavour" : 3,
"Journal of the history of biology" : 9,
"Die Naturwissenschaften" : 4,
"The American psychologist" : 7,
"Scientific American" : 3,
"Current biology : CB" : 5,
"Science (New York, N.Y.)" : 3,
"History and philosophy of the life sciences" : 8,
"Nature" : 10,
"Studies in history and philosophy of biological and biomedical sciences" : 5,
"Journal of the history of the behavioral sciences" : 5,
"Singapore medical journal" : 4,
"Rivista di biologia" : 4,
"Annals of science" : 3,
"Journal of the history of medicine and allied sciences" : 4
},
"total" : 207
}
]

Remove some fields from the articles published in Nature

> db.articles.update({"journal.title":"Nature"},{$unset:{title:1,authors:1,created:1,mesh:1}},false,true)
> db.articles.find({"journal.title":"Nature"}).limit(2).forEach(printjson);



{
"_id" : 19242459,
"doi" : "10.1038/4571087b",
"issue" : "7233",
"journal" : {
"title" : "Nature",
"abbr" : "Nature"
},
"lang" : "eng",
"pgn" : "1087",
"pmid" : 19242459,
"volume" : "457"
}
{
"_id" : 19020602,
"doi" : "10.1038/456324a",
"issue" : "7220",
"journal" : {
"title" : "Nature",
"abbr" : "Nature"
},
"lang" : "eng",
"pgn" : "324-5",
"pmid" : 19020602,
"volume" : "456"
}

Add another author for PMID:382974

> db.articles.update({"pmid":382974},{$push:{authors:{firstName:"John",lastName:"Doe"}}},false,true)
> db.articles.find({"pmid":382974},{authors:1}).forEach(printjson)


{
"_id" : 382974,
"authors" : [
{
"firstName" : "N",
"lastName" : "Freire-Maia"
},
{
"firstName" : "John",
"lastName" : "Doe"
}
]
}

Remove first author for PMID:382974

> db.articles.update({pmid:382974},{$pop:{authors:-1}},false,true)
> db.articles.find({pmid:382974},{authors:1}).forEach(printjson);


{
"_id" : 382974,
"authors" : [
{
"firstName" : "John",
"lastName" : "Doe"
}
]
}

Insert comment for PMID:382974

> db.articles.update({pmid:382974},{$set:{comment:"this is my comment"}})
> db.articles.find({pmid:382974},{comment:1}).forEach(printjson);


{ "_id" : 382974, "comment" : "this is my comment" }

add 2000000 to the year of publication for PMID:382974

> db.articles.update({pmid:382974},{$inc:{"created.year":2000000}},false)
> db.articles.find({pmid:382974},{created:1}).forEach(printjson);


{
"_id" : 382974,
"created" : {
"day" : 26,
"month" : 10,
"year" : 2001979
}
}


That's it.

Pierre

No comments: