Advanced mongo aggregation -: Conditionally project in aggregation with array filtering without elemMatch?

2,373 views
Skip to first unread message

Pratik Bothra

unread,
Mar 14, 2016, 5:08:32 PM3/14/16
to mongodb-user
It turns out $project does not support $elemMatch in aggregation. In 3.2, they introduced filter etc, which doesn't seem to solve my problem. 

Let me explain what I'm trying to do, suppose I've the following documents in the database.

    db.test.insert(
   
{
     
"ad_account_id": 150,
     
"internal_id": 1,
     
"daily": [{
       
"timestamp": "2016-12-01",
       
"impressions": 5
     
}, {
       
"timestamp": "2016-12-06",
       
"impressions": 7
     
}]
   
})
   
    db
.test.insert(
   
{
     
"ad_account_id": 150,
     
"internal_id": 2,
     
"daily": [{
       
"timestamp": "2016-12-03",
       
"impressions": 6
     
}]
   
})
   
    db
.test.insert({
     
"ad_account_id": 150,
     
"internal_id": 3,
     
"daily": []
   
})
   
   
    db
.test.insert({
     
"ad_account_id": 16,
     
"internal_id": 3,
     
"daily": []
   
})


Now suppose a user queries for ad_account_id: 150, and filters by start and end date range as "2016-12-01" to "2016-12-02".

My aggregation query reads like this (skipped sort, limit etc)

    db.getCollection('test').aggregate({
           
"$match" : {
             
"ad_account_id" : 150,
             
"daily" : {
               
"$elemMatch" : {
                 
"timestamp" : {
                   
"$lte" : "2016-12-02",
                   
"$gte" : "2016-12-01"
                 
}
               
}
             
}
           
}
         
},
         
{
           
"$unwind" : "$daily"
         
},
         
{
           
"$match" : {
             
"daily.timestamp" : {
               
"$lte" : "2016-12-02",
               
"$gte" : "2016-12-01"
             
}
           
}
         
},
         
{
           
"$group" : {
             
"impressions" : {
               
"$sum" : "$daily.impressions"
             
},
             
"ad_account_id" : {
               
"$first" : "$ad_account_id"
             
},
             
"_id" : "$internal_id"
           
}
         
},
         
{
           
"$project" : {
             
"impressions" : 1,
             
"ga_transactions" : 1,
             
"ad_account_id" : 1
           
}
         
}
   
);


**Current Result** 

    { "_id" : 1, "impressions" : 5, "ad_account_id" : 150 }


In our local development, it initially seemed okay. The query was fast even with a million documents, and we were happy. 

But we soon realized our use case, where we needed to show rows even if daily data was not between the start date and end date. Impressions etc in it could be represented by 0 but they had to be shown for sure.

So the **desired result** which we wanted was this


     
{ "_id" : 1, "impressions" : 5, "ad_account_id" : 150 }
     
{ "_id" : 2, "impressions" : 0, "ad_account_id" : 150 }
     
{ "_id" : 3, "impressions" : 0, "ad_account_id" : 150 }



And I have been struggling with this for the last few hours as I can't seem to get this in a single mongo query. I thought I would limit my match to just ad account id, and then do a $project, and if no data is there between that data range, I would just add a sample entry to daily with the start data as the timestamp something like this. 

    {
     
"ad_account_id": 150,
     
"internal_id": 3,
     
"daily": [{timestamp: "2016-02-01"}]
   
)


But unfortunately I can't get this to work, as within $project you can't get do $elemMatch. The new things like $filter etc don't seem to solve my problem. 

I also tried union, and I think its almost there as well


    db
.getCollection('test').aggregate(
       
{
           
"$match" : {
             
"ad_account_id" : 150,
             
"daily" : {
               
"$elemMatch" : {
                 
"timestamp" : {
                   
"$lte" : "2016-12-02",
                   
"$gte" : "2016-12-01"
                 
}
               
}
             
}
           
}
         
},
         
{ "$project": {
           
"ad_account_id": 1,
           
"daily": {
               
"$setUnion": [
                   
{ "$map": {
                       
"input": "$daily",
                       
"as": "day",
                       
"in": {
                           
"$cond": [
                               
{ "$and": [
                                   
{ "$gte": [ "$day.timestamp", "2016-12-01" ] },
                                   
{ "$lte": [ "$day.timestamp", "2016-12-02" ] }
                               
]},
                               
"$day",
                               
false
                           
]
                       
}
                   
}},
                   
[{"timestamp": "2016-12-01"}]
               
]
           
}
         
}},
         
{
           
"$unwind" : "$daily"
         
},
         
{
           
"$match" : {
             
"daily.timestamp" : {
               
"$lte" : "2016-12-02",
               
"$gte" : "2016-12-01"
             
}
           
}
         
},
         
{
           
"$group" : {
             
"impressions" : {
               
"$sum" : "$daily.impressions"
             
},
             
"ad_account_id" : {
                 
"$first" : "$ad_account_id"
               
},
             
"_id" : "$internal_id"
           
}
         
},
         
{
           
"$project" : {
             
"impressions" : 1,
             
"ga_transactions" : 1,
             
"ad_account_id" : 1
           
}
         
}
   
}


But this gave me an error ""FieldPath '2016-12-01' doesn't start with $". Curious that by what all different techniques could we handle this? 

What do you think is the best way to do this? Btw stackoverflow link for the same -: http://stackoverflow.com/questions/35992817/conditionally-project-in-aggregation-with-array-filtering-without-elemmatch

Pratik Bothra

unread,
Mar 15, 2016, 1:57:32 AM3/15/16
to mongodb-user
I figured out the solution

Okay, spent literally hours on this, and had an eureka moment. Turns out I wasn't too far from the solution.

    db.getCollection('test').aggregate(
       
{
           
"$match" : {
             
"ad_account_id" : 150
           
}
         
},

         
{ "$project": {
           
"ad_account_id": 1,
           
"internal_id": 1,

           
"daily": {
               
"$setUnion": [
                   
{ "$map": {
                       
"input": "$daily",
                       
"as": "day",
                       
"in": {
                           
"$cond": [
                               
{ "$and": [

                                   
{ "$gte": [ "$$day.timestamp", "2016-12-01" ] },
                                   
{ "$lte": [ "$$day.timestamp", "2016-12-02" ] }
                               
]},
                               
"$$day",
                               
false
                           
]
                       
}
                   
}},
                   
[{"$literal": {"timestamp": "2016-12-01" } }]
               
]
           
}
         
}},
         
{
           
"$unwind" : "$daily"
         
},
         
{

           
"$group" : {
             
"impressions" : {
               
"$sum" : "$daily.impressions"
             
},
             
"ad_account_id" : {

   
...          "$first" : "$ad_account_id"
   
...        },

             
"_id" : "$internal_id"
           
}
         
},
         
{
           
"$project" : {
             
"impressions" : 1,

             
"ad_account_id" : 1
           
}
         
}
   
);


For people looking at this for ideas, I added `"daily_mod": { $addToSet: "$daily" }` to the last $group stage, and added this to the last project  `"daily_mod": 1`.

This will really help you understand what happened and gives an output of -: 

    { "_id" : 3, "impressions" : 0, "ad_account_id" : 150, "daily_mod" : [ { "timestamp" : "2016-12-01" } ] }
   
{ "_id" : 2, "impressions" : 0, "ad_account_id" : 150, "daily_mod" : [ false, { "timestamp" : "2016-12-01" } ] }
   
{ "_id" : 1, "impressions" : 5, "ad_account_id" : 150, "daily_mod" : [ { "timestamp" : "2016-12-01", "impressions" : 5 }, false, { "timestamp" : "2016-12-01" } ] }


If someone can give me a better answer in regards to performance, that would be great. 
...
Reply all
Reply to author
Forward
0 new messages