从海量数据中找中位数（c语言实现）-C-编程技术-六狼论坛-IT论坛-计算机论坛

kenby 发表于 2013-2-1 11:43:58

从海量数据中找中位数（c语言实现）

题目：5亿个int，从中找出第k大的数

算法：之后补上。。。

实现：

#include <assert.h>#include <fcntl.h>#include <stdio.h>#include <stdlib.h>#include <string.h>#include <unistd.h>#include <sys/time.h>#include <sys/types.h>#include <sys/stat.h>typedef struct bucket_t {int *buf;/* 输出缓冲区 */int count;/* 当前有多少个数 */int idx;/* 缓冲区的指针 */} bucket_t;static unsigned int BUF_PAGES;/* 缓冲区有多少个page */static unsigned int PAGE_SIZE;/* page的大小 */static unsigned int BUF_SIZE;/* 缓冲区的大小, BUF_SIZE = BUF_PAGES*PAGE_SIZE */static unsigned int nbuckets;/* 分成多少个桶 */static unsigned int BUCKET_BUF_SIZE;static int *buffer;/* 输入缓冲区 */long get_time_usecs();void write_to_file(bucket_t *bucket, int pos);int partition(int *a, int s, int t);int quick_select(int *a, int s, int t, int i);void swap(int *p, int *q);int main(int argc, char **argv){char filename;unsigned intbp, length, bucket_size, k;intfd, i, bytes;bucket_t*bucket;long start_usecs = get_time_usecs();strcpy(filename, argv);fd = open(filename, O_RDONLY);if (fd < 0) {printf("can't open file %s\n", filename);exit(0);}nbuckets = 1024;k = atoi(argv);PAGE_SIZE = 4096;/* page = 4KB */BUF_PAGES = 1024;BUF_SIZE = PAGE_SIZE*BUF_PAGES;/* 4KB * 1024 = 4M */BUCKET_BUF_SIZE = PAGE_SIZE*128;/* 4KB * 128 = 512KB */buffer = (int *)malloc(BUF_SIZE);//把1-2^32个数分成nbucket个组, nbuckets必须等于2的n次幂bucket = malloc(sizeof(bucket_t)*nbuckets);if (bucket == NULL) exit(0);for (i = 0; i < nbuckets; i++) {bucket.buf = malloc(BUCKET_BUF_SIZE);if (bucket.buf == NULL) {exit(0);}bucket.idx = 0;bucket.count = 0;}bucket_size = (1<<22);/* 分成1024个桶，每个桶容纳2^22个数 */// 读入第一批数据到输入缓冲区 bytes = read(fd, buffer, BUF_SIZE);length = bytes/4;bp = 0;int element, pos;unsigned intbase;bucket_t*p;base = 2147483648;while (1) {//从输入缓冲区取出一个数，加到对应的桶element = buffer;pos = (((long)element)+base)>>22;p = &bucket;p->buf = element;p->count++;//桶内的缓冲区已满，写入文件if (p->idx*4 == BUCKET_BUF_SIZE) {write_to_file(p, pos);p->idx = 0;}//输入缓冲区的数已用完if (bp == length) {bytes = read(fd, buffer, BUF_SIZE);if (bytes == 0) { break;}length = bytes/4;bp = 0;}}//把每个桶剩下的数写入文件for (i = 0; i < nbuckets; i++) {write_to_file(bucket+i, i);}free(buffer);close(fd);buffer = malloc(bucket_size*4);if (buffer == NULL)exit(0); //找出第k大的数位于哪个文件unsigned sum = 0;for (i = 0; i < nbuckets && sum < k; i++) {sum += bucket.count;}i--;//把该文件读入内存sprintf(filename, "foo_%d.dat", i);printf("第%d大的数位于文件%s的第%d大的数\n", k, filename, k+bucket.count-sum);fd = open(filename, O_RDONLY);if (fd < 0) {printf("can't open file %s\n", filename);free(buffer);exit(0);}bytes = read(fd, buffer, bucket_size*4);length = bytes/4;//选择文件内第(k+bucket.count-sum)大的数int answer;answer = quick_select(buffer, 1, length-1, k+bucket.count-sum);printf("第%d大的数 = %d\n", k, answer);close(fd);free(buffer);//free bucketsfor (i = 0; i < nbuckets; i++) {free(bucket.buf);}free(bucket);long end_usecs = get_time_usecs();double secs = (double)(end_usecs - start_usecs) / (double)1000000;printf("it took %.02f seconds.\n", secs);return 0;}void write_to_file(bucket_t *bucket, int pos){charfilename;intfd, bytes;sprintf(filename, "foo_%d.dat", pos);fd = open(filename, O_WRONLY | O_CREAT | O_APPEND, 0666);if (fd < 0) {printf("can't open file %s\n", filename);exit(0);}bytes = write(fd, bucket->buf, bucket->idx*4);if (bucket->idx*4 != bytes) {printf("idx = %d, bytes = %d, write error\n", bucket->idx, bytes);close(fd);exit(0);}close(fd);}long get_time_usecs(){struct timeval time;struct timezone tz;memset(&tz, '\0', sizeof(struct timezone));gettimeofday(&time, &tz);long usecs = time.tv_sec*1000000 + time.tv_usec;return usecs;}void swap(int *p, int *q){inttmp;tmp = *p;*p = *q;*q = tmp;}/* 把a作为参考，将数组分成三部分: 小于等于a， * a以及大于a，分割完毕后，a所在的下标即是a的顺序 */int partition(int *a, int s, int t){inti, j;/* i用来遍历a...a, j指向大于x部分的第一个元素 */for (i = j = s; i < t; i++) {if (a < a) {swap(a+i, a+j);j++;}}swap(a+j, a+t);return j;}/* 选择数组中第i大的元素并返回 */int quick_select(int *a, int s, int t, int i){intp, m;if (s == t) return a;p = partition(a, s, t);m = p - s + 1;if (m == i) return a;if (m > i) {return quick_select(a, s, p-1, i);}return quick_select(a, p+1, t, i-m);} 运行和测试：
寻找第1111大的整数

dd if=/dev/urandom of=random.dat bs=1M count=1024
gcc main.c

页: [1]

六狼论坛's Archiver

从海量数据中找中位数（c语言实现）