http://docs.mongodb.org/manual/core/map-reduce/
http://docs.mongodb.org/manual/reference/command/mapReduce/
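Example (mongo shell): count the records for each distinct quantity greater than 500 and return the result inline:
> db.lattern_money_record.mapReduce( function() { emit(this.quantity, 1) }, function(key, values) { return Array.sum(values) }, { query: {'quantity': {$gt: 500}}, out: {inline: 1} } )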
{
"results" : [
{
"_id" : 550,
"value" : 3
},
{
"_id" : 570,
"value" : 1
},
{
"_id" : 580,
"value" : 1
},
{
"_id" : 583,
"value" : 1
},
{
"_id" : 587,
"value" : 1
},
{
"_id" : 600,
"value" : 2
},
{
"_id" : 660,
"value" : 1
},
{
"_id" : 700,
"value" : 2
},
{
"_id" : 800,
"value" : 5
},
{
"_id" : 900,
"value" : 2
},
{
"_id" : 924,
"value" : 1
},
{
"_id" : 949,
"value" : 1
},
{
"_id" : 980,
"value" : 1
},
{
"_id" : 990,
"value" : 1
},
{
"_id" : 1000,
"value" : 12
}
],
"timeMillis" : 36,
"counts" : {
"input" : 35,
"emit" : 35,
"reduce" : 6,
"output" : 15
},
"ok" : 1,
}
The MapReduce code I used to analyze the 20 million hotel reservation records:
def get_aggregation(collection):
    '''
    Run a map-reduce job over the ID-card records of ``collection``.

    Analyses this job was written for (switch between them by toggling the
    commented-out ``emit`` calls in the mapper):

    1. Get unique set of people
    2. Get most frequent users
    3. Get aggregation by location of birth, age, month and day of birth

    Only documents matching ``{'CtfTp': 'ID'}`` are processed. For each one
    the mapper validates ``CtfId`` as a 15- or 18-character resident ID
    number and derives the province code, birth date and sex from it.
    Records whose ID is missing, malformed or implausible are counted under
    the key ``'Corrupted'``. As currently configured, the mapper emits the ID
    number itself for every male record with province code 32, so the output
    holds one count per such person.

    :param collection: collection object exposing a PyMongo-style
        ``map_reduce(map, reduce, out, **kwargs)`` method.
    :return: whatever ``collection.map_reduce`` returns; the job output is
        written to the ``'aggregation'`` collection.
    '''
    # Emit multiple times in mapper function:
    # http://docs.mongodb.org/manual/reference/command/mapReduce/
    #
    # The JavaScript is kept in a *raw* string: in a normal Python literal
    # ``\\`` and ``\"`` would be rewritten before reaching the server (which
    # silently changes the regular expressions) and ``\d`` / ``\s`` are
    # invalid Python escape sequences.
    mapper = Code(r'''
        function() {
            // Returns true when `id` is a well-formed resident ID number.
            function validate_rid(id) {
                // From: https://gist.github.com/foxwoods/1817822

                // 18-character ID number
                // National standard GB 11643-1999
                function rid18(id) {
                    // Anchored: the whole string must be 17 digits plus a
                    // check character, not merely contain such a run.
                    if (!/^\d{17}[\dxX]$/.test(id)) {
                        return false;
                    }
                    var modcmpl = function(m, i, n) { return (i + n - m % i) % i; },
                        f = function(v, i) { return v * (Math.pow(2, i - 1) % 11); },
                        s = 0;
                    for (var i = 0; i < 17; i++) {
                        s += f(+id.charAt(i), 18 - i);
                    }
                    var c0 = id.charAt(17),
                        c1 = modcmpl(s, 11, 1);
                    // A check value of 10 is written as 'x' / 'X'.
                    return c0 - c1 === 0 || (c0.toLowerCase() === 'x' && c1 === 10);
                }

                // 15-digit ID number
                // Discontinued as of 2013-01-01
                // http://www.gov.cn/flfg/2011-10/29/content_1981408.htm
                function rid15(id) {
                    // Anchored so that an 18-character ID which failed its
                    // checksum cannot be accepted via its first 15 digits.
                    var pattern = /^[1-9]\d{5}(\d{2})(\d{2})(\d{2})\d{3}$/,
                        matches = id.match(pattern),
                        y, m, d, date;
                    if (!matches) {
                        return false;
                    }
                    y = +('19' + matches[1]);
                    m = +matches[2];
                    d = +matches[3];
                    // Round-trip through Date to reject impossible dates.
                    date = new Date(y, m - 1, d);
                    return (date.getFullYear() === y && date.getMonth() === m - 1 && date.getDate() === d);
                }

                if (typeof id !== 'string') {
                    return false;
                }
                // An exception thrown by the mapper would abort the whole
                // job, so any unexpected failure is treated as "invalid".
                try {
                    return rid18(id) || rid15(id);
                } catch (err) {
                    return false;
                }
            }

            // Only used by the (currently disabled) e-mail domain aggregation.
            function validateEmail(email) {
                // http://stackoverflow.com/questions/46155/validate-email-address-in-javascript
                var re = /^(([^<>()[\]\\.,;:\s@\"]+(\.[^<>()[\]\\.,;:\s@\"]+)*)|(\".+\"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$/;
                return re.test(email);
            }

            var str = this.CtfId;
            if (str && validate_rid(str)) {
                // Always pass radix 10: fields such as "08" / "09" must not
                // be parsed as octal by older JavaScript engines.
                var prov = parseInt(str.slice(0, 2), 10);
                var year, month, day, sex;
                if (str.length == 15) {
                    // 15-digit layout: two-digit birth year, sex digit last.
                    year = parseInt('19' + str.slice(6, 8), 10);
                    month = parseInt(str.slice(8, 10), 10);
                    day = parseInt(str.slice(10, 12), 10);
                    sex = parseInt(str.slice(14, 15), 10) % 2 ? 'M' : 'F';
                } else {
                    // 18-character layout: four-digit birth year, sex digit
                    // second to last (the last character is the checksum).
                    year = parseInt(str.slice(6, 10), 10);
                    month = parseInt(str.slice(10, 12), 10);
                    day = parseInt(str.slice(12, 14), 10);
                    sex = parseInt(str.slice(16, 17), 10) % 2 ? 'M' : 'F';
                }
                // Age is computed relative to the hard-coded year 2013.
                var age = 2013 - year;
                var valid_provs = [11, 12, 13, 14, 15,
                                   21, 22, 23, 31, 32, 33, 34, 35, 36, 37,
                                   41, 42, 43, 44, 45, 46,
                                   50, 51, 52, 53, 54,
                                   61, 62, 63, 64, 65,
                                   71, 81, 82, 91];
                if (age <= 0 || age > 100 ||
                    month <= 0 || month > 12 ||
                    day <= 0 || day > 31 ||
                    valid_provs.indexOf(prov) == -1) {
                    emit('Corrupted', 1);
                } else {
                    // Alternative aggregations; enable the ones needed.
                    // emit('Province ' + prov, 1);
                    // emit('Age ' + age, 1);
                    // emit('Month ' + month, 1);
                    // emit('Day ' + day, 1);
                    // emit('Sex ' + sex, 1);
                    // emit('Prov ' + prov + ' Sex ' + sex, 1);
                    // if (this.Address && this.Address.length > 3) {
                    //     var cur_prov = this.Address.slice(0, 3);
                    //     emit('From ' + prov + ' to ' + cur_prov, 1);
                    // }
                    // var email = this.EMail;
                    // if (email && validateEmail(email)) {
                    //     var idx = email.lastIndexOf('@');
                    //     var domain = email.slice(idx + 1);
                    //     emit(domain.toLowerCase(), 1);
                    // }
                    if (prov == 32 && sex == 'M') {
                        emit(str, 1);
                    }
                    // if (prov == 32 && sex == 'F') {
                    //     emit(str, 1);
                    // }
                }
            } else {
                emit('Corrupted', 1);
            }
        }''')
    # Sums the 1s emitted for each key, i.e. counts occurrences per key.
    reducer = Code('''
        function(key, values) {
            return Array.sum(values);
        }''')
    result = collection.map_reduce(
        mapper, reducer, 'aggregation', query={'CtfTp': 'ID'}
    )
    return result