We are building a simplified version of a search engine on top of MongoDB.
Sample data set
{ "_id" : 1, "dept" : "tech", "updDate": ISODate("2014-08-27T09:45:35Z"), "description" : "lime green computer" }
{ "_id" : 2, "dept" : "tech", "updDate": ISODate("2014-07-27T09:45:35Z"), "description" : "wireless red mouse" }
{ "_id" : 3, "dept" : "kitchen", "updDate": ISODate("2014-04-27T09:45:35Z"), "description" : "green placemat" }
{ "_id" : 4, "dept" : "kitchen", "updDate": ISODate("2014-05-27T09:45:35Z"), "description" : "red peeler" }
{ "_id" : 5, "dept" : "food", "updDate": ISODate("2014-04-27T09:45:35Z"), "description" : "green apple" }
{ "_id" : 6, "dept" : "food", "updDate": ISODate("2014-01-27T09:45:35Z"), "description" : "red potato" }
{ "_id" : 7, "dept" : "food", "updDate": ISODate("2014-08-28T09:45:35Z"), "description" : "lime green computer" }
{ "_id" : 8, "dept" : "food", "updDate": ISODate("2014-08-27T09:45:35Z"), "description" : "lime green computer" }
{ "_id" : 9, "dept" : "food", "updDate": ISODate("2014-08-27T09:45:35Z"), "description" : "lime green computer" }
We want to avoid using "offset-limit" to paginate the results, in order to do that we are basically using the "seek method" by modifying the 'where/match' clause of the query in order to be able to use an index instead of iterating over the collection to fetch the desired results. More information about "seek method" can be found here
http://use-the-index-luke.com/blog/2013-07/pagination-done-the-postgresql-way
Search engines typically order results by score and update date in descendant order. To achieve that we are using the text search function in the aggregation pipeline as follows.
db.inventory.createIndex({description:"text", dept: -1, updDate: -1, id:-1})
First page
db.inventory.aggregate( [ { $match: { dept : {$in : ["food","kitchen"]},"$text" : { "$language" : "en", "$search" : "green"} } },{ $project: {score: { $meta: "textScore" }, description : 1, updDate : 1, _id: 1 } }, { $sort: { "score" : -1, "updDate" : -1, _id: -1 } }, {$limit: 2 }] )
{ "_id" : 5, "updDate" : ISODate("2014-04-27T09:45:35Z"), "description" : "green apple", "score" : 0.75 }
{ "_id" : 3, "updDate" : ISODate("2014-04-27T09:45:35Z"), "description" : "green placemat", "score" : 0.75 }
Second page
db.inventory.aggregate( [ { $match: { dept : {$in : ["food","kitchen"]},"$text" : { "$language" : "en", "$search" : "green"} } },{ $project: {score: { $meta: "textScore" }, description : 1, updDate : 1, _id: 1 } }, { $sort: { "score" : -1, "updDate" : -1, _id: -1 } }, { "$match" : { "$or" : [ { "score" : { "$lt" : 0.75}} , { "$and" : [ { "score" : { "$eq" : 0.75}} , { "$or" : [ { "updDate" : { "$lt" : ISODate("2014-04-27T09:45:35Z")}},{ "$and" : [ { "updDate": { "$eq" : ISODate("2014-04-27T09:45:35Z")}} , { "_id" : { "$lt" : 3}}]}]}]}]}},{$limit: 2 }] )
{ "_id" : 7, "updDate" : ISODate("2014-08-28T09:45:35Z"), "description" : "lime green computer", "score" : 0.6666666666666666 }
{ "_id" : 9, "updDate" : ISODate("2014-08-27T09:45:35Z"), "description" : "lime green computer", "score" : 0.6666666666666666 }
And the last page
db.inventory.aggregate( [ { $match: { dept : {$in : ["food","kitchen"]} , "$text" : { "$language" : "en", "$search" : "green"} } }, { $project: {score: { $meta: "textScore" }, description : 1, updDate : 1, _id: 1 } }, { $sort: { "score" : -1, "updDate" : -1, _id: -1 } }, { "$match" : { "$or" : [ { "score" : { "$lt" : 0.6666666666666666}} , { "$and" : [ { "score" : { "$eq" : 0.6666666666666666}} , { "$or" : [ { "updDate" : { "$lt" : ISODate("2014-08-27T09:45:35Z")}} , { "$and" : [ { "updDate" : { "$eq" : ISODate("2014-08-27T09:45:35Z")}} , { "_id" : { "$lt" : 9}}]}]}]}]}}, {$limit: 2 }] )
{ "_id" : 8, "updDate" : ISODate("2014-08-27T09:45:35Z"), "description" : "lime green computer", "score" : 0.6666666666666666 }
Notice how we order the results by score, updDate and id, and in the second match phase how we try to paginate through them using document's score value, update date and finally the id.
As of version 2.6.0, you cannot intersect with geo or text indices and you can intersect at most 2 separate indices with each other. These limitations are likely to change in a future release.
So what is the best index strategy for indexing this collection from the text-search perspective?
One index for the first match phase and another one for the second(pagination) match phase?
db.inventory.createIndex({description:"text", dept: -1})
db.inventory.createIndex({updDate: -1, id:-})
A compound index taking into account fields from both match phases?
db.inventory.createIndex({description:"text", dept: -1, updDate: -1, id:-1})
I've read somewhere that MongoDB is unable to use an index for the second match stage, is it true?
Thanks