Skip to content →

Tag: MapReduce

MapReduce in MongoDB

http://docs.mongodb.org/manual/core/map-reduce/

http://docs.mongodb.org/manual/reference/command/mapReduce/

// Count how many documents share each distinct `quantity` value, restricted to documents with quantity > 500. The map step emits (quantity, 1), the reduce step sums the 1s, and `out: {inline: 1}` returns the results in the reply instead of writing them to a collection.
> db.lattern_money_record.mapReduce( function() { emit(this.quantity, 1) }, function(key, values) { return Array.sum(values) }, {   query: {'quantity': {$gt: 500}}, out: {inline: 1} } )
{
	"results" : [
		{
			"_id" : 550,
			"value" : 3
		},
		{
			"_id" : 570,
			"value" : 1
		},
		{
			"_id" : 580,
			"value" : 1
		},
		{
			"_id" : 583,
			"value" : 1
		},
		{
			"_id" : 587,
			"value" : 1
		},
		{
			"_id" : 600,
			"value" : 2
		},
		{
			"_id" : 660,
			"value" : 1
		},
		{
			"_id" : 700,
			"value" : 2
		},
		{
			"_id" : 800,
			"value" : 5
		},
		{
			"_id" : 900,
			"value" : 2
		},
		{
			"_id" : 924,
			"value" : 1
		},
		{
			"_id" : 949,
			"value" : 1
		},
		{
			"_id" : 980,
			"value" : 1
		},
		{
			"_id" : 990,
			"value" : 1
		},
		{
			"_id" : 1000,
			"value" : 12
		}
	],
	"timeMillis" : 36,
	"counts" : {
		"input" : 35,
		"emit" : 35,
		"reduce" : 6,
		"output" : 15
	},
	"ok" : 1,
}

The MapReduce code I used to analyze the 20 million hotel reservation records:

def get_aggregation(collection):
    '''
    Run a MapReduce job over ``collection`` that tallies resident-ID records.

    Only documents matching ``{'CtfTp': 'ID'}`` are fed to the mapper.  For
    each such document the mapper reads ``CtfId`` and:

    * emits ``('Corrupted', 1)`` when the ID is missing, fails validation
      (18-character checksum or 15-digit birth-date check), or decodes to an
      implausible age / month / day / province code;
    * otherwise emits ``(CtfId, 1)`` when the province code is 32 and the
      sex digit is odd ('M').

    The reducer sums the emitted 1s, so each output value is the number of
    records sharing that key.  The commented-out ``emit`` calls inside the
    mapper are alternative aggregations (by province, age, birth month/day,
    sex, e-mail domain, ...); enable the ones you need.

    Results are written to the ``'aggregation'`` collection.

    :param collection: object exposing a PyMongo-style ``map_reduce`` method
        (presumably a ``pymongo.collection.Collection`` -- confirm against
        the caller).
    :return: whatever ``collection.map_reduce`` returns.
    '''
    # Emit multiple times in mapper function:
    # http://docs.mongodb.org/manual/reference/command/mapReduce/
    #
    # The mapper is a *raw* string on purpose: without the r-prefix Python
    # would process the backslash escapes itself (e.g. turn '\\.' into '\.'
    # and '\"' into '"'), silently altering the JavaScript regexes.
    #
    # The JavaScript deliberately sticks to `var` / `indexOf`, since the
    # server-side engine may not support newer syntax.
    mapper = Code(r'''
                  function() {
                    // Returns true when `id` is a well-formed 18- or
                    // 15-character resident ID number, false otherwise.
                    function validate_rid(id) {
                        // From: https://gist.github.com/foxwoods/1817822
                        // 18-character resident ID number
                        // National standard GB 11643-1999
                        function rid18(id) {
                            // Anchored so that strings with extra leading or
                            // trailing characters are rejected.
                            if (!/^\d{17}[\dxX]$/.test(id)) {
                                return false;
                            }
                            // (i + n - m % i) % i: the expected check value.
                            var modcmpl = function(m, i, n) { return (i + n - m % i) % i; },
                                // Digit weighted by 2^(position-1) mod 11.
                                f = function(v, i) { return v * (Math.pow(2, i-1) % 11); },
                                s = 0;
                            for (var i = 0; i < 17; i++) {
                                s += f(+id.charAt(i), 18-i);
                            }
                            var c0 = id.charAt(17),
                                c1 = modcmpl(s, 11, 1);
                            // A check value of 10 is written as 'X' / 'x'.
                            return c0-c1===0 || (c0.toLowerCase()==='x' && c1===10);
                        }

                        // 15-digit resident ID number
                        // To be retired starting 1 January 2013
                        // http://www.gov.cn/flfg/2011-10/29/content_1981408.htm
                        function rid15(id) {
                            // Anchored: without ^...$ an 18-character ID with
                            // a bad checksum could still pass here whenever
                            // its first 15 characters looked like a valid
                            // 15-digit ID.
                            var pattern = /^[1-9]\d{5}(\d{2})(\d{2})(\d{2})\d{3}$/,
                                matches, y, m, d, date;
                            matches = id.match(pattern);
                            if (!matches) {
                                return false;
                            }
                            y = +('19' + matches[1]);
                            m = +matches[2];
                            d = +matches[3];
                            // Round-trip through Date to reject impossible
                            // dates such as month 13 or 31 February.
                            date = new Date(y, m-1, d);
                            return (date.getFullYear()===y && date.getMonth()===m-1 && date.getDate()===d);
                        }

                        // Best effort: a non-string id makes charAt/match
                        // throw; treat that as "not valid", never as a job
                        // failure.
                        try {
                            return rid18(id) || rid15(id);
                        } catch (err) {
                            return false;
                        }
                    }

                    // Only used by the commented-out e-mail aggregation below.
                    function validateEmail(email) {
                        // http://stackoverflow.com/questions/46155/validate-email-address-in-javascript
                        var re = /^(([^<>()[\]\\.,;:\s@\"]+(\.[^<>()[\]\\.,;:\s@\"]+)*)|(\".+\"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$/;
                        return re.test(email);
                    }

                    var str = this.CtfId;
                    if (str && validate_rid(str)) {
                        // Always pass radix 10: on pre-ES5 engines
                        // parseInt('08') is parsed as octal and yields 0.
                        var prov = parseInt(str.slice(0, 2), 10);
                        var year, month, day, sex;
                        if (str.length === 15) {
                            // 15-digit layout: two-digit birth year, sex in
                            // the last digit.
                            year = parseInt('19' + str.slice(6, 8), 10);
                            month = parseInt(str.slice(8, 10), 10);
                            day = parseInt(str.slice(10, 12), 10);
                            sex = parseInt(str.slice(14, 15), 10) % 2 ? 'M' : 'F';
                        } else {
                            // 18-character layout: four-digit birth year,
                            // sex in the 17th digit.
                            year = parseInt(str.slice(6, 10), 10);
                            month = parseInt(str.slice(10, 12), 10);
                            day = parseInt(str.slice(12, 14), 10);
                            sex = parseInt(str.slice(16, 17), 10) % 2 ? 'M' : 'F';
                        }
                        // Age is computed relative to the hard-coded year 2013.
                        var age = 2013 - year;
                        // Accepted values for the first two digits of the ID.
                        var valid_provs = [11, 12, 13, 14, 15,
                            21, 22, 23, 31, 32, 33, 34, 35, 36, 37,
                            41, 42, 43, 44, 45, 46,
                            50, 51, 52, 53, 54,
                            61, 62, 63, 64, 65,
                            71, 81, 82, 91];
                        if (age <= 0 || age > 100 ||
                            month <= 0 || month > 12 ||
                            day <= 0 || day > 31 ||
                            valid_provs.indexOf(prov) === -1) {
                            emit('Corrupted', 1);
                        } else {
                            // Alternative aggregations -- uncomment as needed.
                            // emit('Province ' + prov, 1);
                            // emit('Age ' + age, 1);
                            // emit('Month ' + month, 1);
                            // emit('Day ' + day, 1);
                            // emit('Sex ' + sex, 1);
                            // emit('Prov ' + prov + ' Sex ' + sex, 1);
                            // if (this.Address && this.Address.length > 3) {
                            //     var cur_prov = this.Address.slice(0, 3);
                            //     emit('From ' + prov + ' to ' + cur_prov, 1);
                            // }

                            // var email = this.EMail;
                            // if (email && validateEmail(email)) {
                            //     var idx = email.lastIndexOf('@');
                            //     var domain = email.slice(idx + 1);
                            //     emit(domain.toLowerCase(), 1);
                            // }

                            // Active aggregation: one key per ID for
                            // province code 32 with sex 'M'.
                            if (prov === 32 && sex === 'M') {
                                emit(str, 1);
                            }
                            // if (prov == 32 && sex == 'F') {
                            //     emit(str, 1);
                            // }
                        }
                    } else {
                        emit('Corrupted', 1);
                    }
                  }''')
    # Every emitted value is 1, so the sum is the number of records per key.
    reducer = Code('''
                   function(key, values) {
                    return Array.sum(values);
                   }''')
    result = collection.map_reduce(
        mapper, reducer, 'aggregation', query={'CtfTp': 'ID'}
    )
    return result

 

Leave a Comment