http://docs.mongodb.org/manual/core/map-reduce/
http://docs.mongodb.org/manual/reference/command/mapReduce/
// mongo shell example: for documents whose `quantity` is greater than 500, the map function emits (quantity, 1) and the reduce function sums the emitted 1s, giving a count of documents per distinct quantity. `out: {inline: 1}` returns the results directly in the command reply, which is the document shown after the call.
> db.lattern_money_record.mapReduce( function() { emit(this.quantity, 1) }, function(key, values) { return Array.sum(values) }, { query: {'quantity': {$gt: 500}}, out: {inline: 1} } ) { "results" : [ { "_id" : 550, "value" : 3 }, { "_id" : 570, "value" : 1 }, { "_id" : 580, "value" : 1 }, { "_id" : 583, "value" : 1 }, { "_id" : 587, "value" : 1 }, { "_id" : 600, "value" : 2 }, { "_id" : 660, "value" : 1 }, { "_id" : 700, "value" : 2 }, { "_id" : 800, "value" : 5 }, { "_id" : 900, "value" : 2 }, { "_id" : 924, "value" : 1 }, { "_id" : 949, "value" : 1 }, { "_id" : 980, "value" : 1 }, { "_id" : 990, "value" : 1 }, { "_id" : 1000, "value" : 12 } ], "timeMillis" : 36, "counts" : { "input" : 35, "emit" : 35, "reduce" : 6, "output" : 15 }, "ok" : 1, }
The MapReduce code I used to analyze the 20 million hotel reservation records:
def get_aggregation(collection):
    '''
    Run a MapReduce job over the documents of ``collection`` whose ``CtfTp``
    field equals ``'ID'`` and write the result to the ``aggregation``
    collection.

    The mapper validates each document's ``CtfId`` as a 15- or 18-digit
    resident ID number and derives province code, birth date, age
    (relative to 2013) and sex from it. As currently configured it emits:

    * ``('Corrupted', 1)`` when the ID is missing, fails validation, or
      yields an out-of-range age/month/day or an unknown province code;
    * ``(CtfId, 1)`` when the ID is valid, the province code is 32 and the
      sex digit is odd (male).

    The reducer sums the emitted values, so each output document holds the
    number of matching records per key. The commented-out ``emit`` calls in
    the mapper are the alternative aggregations (by province, age, month,
    day, sex, address prefix, e-mail domain); enable the ones needed.

    :param collection: object exposing a PyMongo-style ``map_reduce``
        method (presumably a ``pymongo.collection.Collection``).
    :returns: whatever ``collection.map_reduce`` returns (in PyMongo, the
        output collection -- verify against the installed driver version).
    '''
    # A mapper may call emit() any number of times per document:
    # http://docs.mongodb.org/manual/reference/command/mapReduce/
    #
    # The JavaScript is kept in a *raw* string so that regex escapes such as
    # \d, \\ and \" reach MongoDB exactly as written instead of being
    # rewritten by Python's string-escape processing.
    mapper = Code(r'''
        function() {
            function validate_rid(id) {
                // Based on: https://gist.github.com/foxwoods/1817822
                if (typeof id !== 'string') {
                    return false;
                }

                // 18-digit resident ID number
                // National standard GB 11643-1999
                function rid18(id) {
                    // Anchored so that only an exact 18-character ID matches.
                    if (!/^\d{17}[\dxX]$/.test(id)) {
                        return false;
                    }
                    var modcmpl = function(m, i, n) { return (i + n - m % i) % i; };
                    var weigh = function(v, i) { return v * (Math.pow(2, i - 1) % 11); };
                    var sum = 0;
                    for (var i = 0; i < 17; i++) {
                        sum += weigh(+id.charAt(i), 18 - i);
                    }
                    var c0 = id.charAt(17);
                    var c1 = modcmpl(sum, 11, 1);
                    // A check value of 10 is written as 'X'.
                    return c0 - c1 === 0 || (c0.toLowerCase() === 'x' && c1 === 10);
                }

                // 15-digit resident ID number
                // Scheduled to be discontinued from 1 January 2013
                // http://www.gov.cn/flfg/2011-10/29/content_1981408.htm
                function rid15(id) {
                    // Anchored so that only an exact 15-digit ID matches.
                    var matches = id.match(/^[1-9]\d{5}(\d{2})(\d{2})(\d{2})\d{3}$/);
                    if (!matches) {
                        return false;
                    }
                    var y = +('19' + matches[1]);
                    var m = +matches[2];
                    var d = +matches[3];
                    // Round-trip through Date to reject impossible dates
                    // (e.g. month 13 or 31 February).
                    var date = new Date(y, m - 1, d);
                    return (date.getFullYear() === y &&
                            date.getMonth() === m - 1 &&
                            date.getDate() === d);
                }

                return rid18(id) || rid15(id);
            }

            // Only referenced by the optional e-mail domain aggregation below.
            function validateEmail(email) {
                // http://stackoverflow.com/questions/46155/validate-email-address-in-javascript
                var re = /^(([^<>()[\]\\.,;:\s@\"]+(\.[^<>()[\]\\.,;:\s@\"]+)*)|(\".+\"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$/;
                return re.test(email);
            }

            var str = this.CtfId;
            if (!str || !validate_rid(str)) {
                emit('Corrupted', 1);
                return;
            }

            // Always pass radix 10: without it, older JavaScript engines
            // parse '08' and '09' as octal and return 0.
            var prov = parseInt(str.slice(0, 2), 10);
            var year, month, day, sex;
            if (str.length === 15) {
                year = parseInt('19' + str.slice(6, 8), 10);
                month = parseInt(str.slice(8, 10), 10);
                day = parseInt(str.slice(10, 12), 10);
                sex = parseInt(str.slice(14, 15), 10) % 2 ? 'M' : 'F';
            } else {
                year = parseInt(str.slice(6, 10), 10);
                month = parseInt(str.slice(10, 12), 10);
                day = parseInt(str.slice(12, 14), 10);
                sex = parseInt(str.slice(16, 17), 10) % 2 ? 'M' : 'F';
            }

            // Ages are computed relative to the year 2013.
            var age = 2013 - year;
            var valid_provs = [11, 12, 13, 14, 15, 21, 22, 23, 31, 32, 33, 34,
                               35, 36, 37, 41, 42, 43, 44, 45, 46, 50, 51, 52,
                               53, 54, 61, 62, 63, 64, 65, 71, 81, 82, 91];

            if (age <= 0 || age > 100 ||
                    month <= 0 || month > 12 ||
                    day <= 0 || day > 31 ||
                    valid_provs.indexOf(prov) === -1) {
                emit('Corrupted', 1);
                return;
            }

            // Alternative aggregations -- enable the emit(s) required:
            // emit('Province ' + prov, 1);
            // emit('Age ' + age, 1);
            // emit('Month ' + month, 1);
            // emit('Day ' + day, 1);
            // emit('Sex ' + sex, 1);
            // emit('Prov ' + prov + ' Sex ' + sex, 1);
            // if (this.Address && this.Address.length > 3) {
            //     var cur_prov = this.Address.slice(0, 3);
            //     emit('From ' + prov + ' to ' + cur_prov, 1);
            // }
            // var email = this.EMail;
            // if (email && validateEmail(email)) {
            //     var idx = email.lastIndexOf('@');
            //     var domain = email.slice(idx + 1);
            //     emit(domain.toLowerCase(), 1);
            // }
            // if (prov === 32 && sex === 'F') {
            //     emit(str, 1);
            // }

            // Active aggregation: one key per valid male ID from province 32.
            if (prov === 32 && sex === 'M') {
                emit(str, 1);
            }
        }''')

    # Sum the 1s emitted for each key to get a per-key record count.
    reducer = Code('''
        function(key, values) {
            return Array.sum(values);
        }''')

    result = collection.map_reduce(
        mapper,
        reducer,
        'aggregation',
        query={'CtfTp': 'ID'}
    )
    return result
Leave a Comment