#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <unistd.h>
#include <sys/resource.h>
#include <inttypes.h>
#include <string.h>
#include <sys/mman.h>
#include <errno.h>
/*
 * Vector abstraction used by the zero-scanning code below.
 *
 *   VECTYPE       - the widest unit compared in one step.
 *   SPLAT(p)      - a VECTYPE whose every byte equals the byte at *p.
 *   ALL_EQ(a, b)  - nonzero when every byte of a equals the matching byte
 *                   of b.
 */
#if defined __SSE2__
#include <emmintrin.h>
#define VECTYPE __m128i
#define SPLAT(p) _mm_set1_epi8(*(p))
/*
 * The byte-wise compare sets a lane to all ones where the inputs match;
 * movemask collects one bit per lane, so 0xFFFF means all 16 lanes matched.
 * The definition must stay one logical line: keep the trailing backslash.
 */
#define ALL_EQ(v1, v2) \
    (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 0xFFFF)
#else
#define VECTYPE unsigned long
/* ~0UL / 255 is 0x0101...01, so the product repeats the byte in each byte. */
#define SPLAT(p) (*(p) * (~0UL / 255))
#define ALL_EQ(v1, v2) ((v1) == (v2))
#endif
/* Number of VECTYPE elements examined per step of the unrolled scan. */
#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
/* Largest multiple of m that is not greater than n (m is evaluated twice). */
#define QEMU_ALIGN_DOWN(n, m) (((n) / (m)) * (m))
/* Smallest multiple of m that is not less than n (m is evaluated three times). */
#define QEMU_ALIGN_UP(n, m) ((((n) + (m) - 1) / (m)) * (m))
/* Start-address alignment used by qemu_anon_ram_alloc(): 256 * 4096 bytes. */
#define QEMU_VMALLOC_ALIGN (256 * 4096)
/*
 * Allocate size bytes of private anonymous memory whose start address is a
 * multiple of QEMU_VMALLOC_ALIGN.
 *
 * The request is over-allocated by enough slack to guarantee that an aligned
 * address exists inside the mapping; the unused head and tail are then
 * unmapped again.
 *
 * Returns the aligned start address.  On failure (mmap() error, or a size so
 * large that adding the slack would overflow) a message is printed to stderr
 * and the process aborts, so the function never returns a failure value.
 */
void *qemu_anon_ram_alloc(size_t size)
{
    const size_t align = QEMU_VMALLOC_ALIGN;
    const size_t pagesize = (size_t)getpagesize();
    /*
     * The mapping is treated as page-aligned (the original computation makes
     * the same assumption), so an aligned address lies at most
     * align - pagesize bytes past its start.
     */
    const size_t slack = pagesize < align ? align - pagesize : 0;
    size_t total = 0;
    size_t offset;
    void *map = MAP_FAILED;
    char *base;
    char *ptr;

    if (size <= SIZE_MAX - slack) {
        total = size + slack;
        map = mmap(NULL, total, PROT_READ | PROT_WRITE,
                   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
    } else {
        /* size + slack would wrap; report it like an allocation failure. */
        errno = ENOMEM;
    }
    if (map == MAP_FAILED) {
        fprintf(stderr, "Failed to allocate %zu B: %s\n",
                size, strerror(errno));
        abort();
    }

    /* Use char * for address arithmetic; it is not defined on void *. */
    base = map;
    offset = QEMU_ALIGN_UP((uintptr_t)base, align) - (uintptr_t)base;
    ptr = base + offset;
    total -= offset;

    /*
     * Give back the unaligned head and the unused tail.  A failed munmap()
     * is ignored here: the returned range itself stays mapped.
     */
    if (offset > 0) {
        munmap(base, offset);
    }
    if (total > size) {
        munmap(ptr + size, total - size);
    }
    return ptr;
}
/*
 * Check the preconditions of buffer_find_nonzero_offset():
 *   - len is a multiple of one unrolled group
 *     (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE) bytes);
 *   - buf is aligned to sizeof(VECTYPE).
 *
 * Returns 1 when both hold, 0 otherwise.
 */
static inline int
can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
{
    const size_t group_bytes =
        BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE);

    if (len % group_bytes != 0) {
        return 0;
    }
    if ((uintptr_t)buf % sizeof(VECTYPE) != 0) {
        return 0;
    }
    return 1;
}
size_t buffer_find_nonzero_offset(const void *buf, size_t len)
{
const VECTYPE *p = buf;
const VECTYPE zero = (VECTYPE){0};
size_t i;
if (!len) {
return 0;
}
assert(can_use_buffer_find_nonzero_offset(buf, len));
for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
if (!ALL_EQ(p[i], zero)) {
return i * sizeof(VECTYPE);
}
}
for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
i < len / sizeof(VECTYPE);
i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
VECTYPE tmp0 = p[i + 0] | p[i + 1];
VECTYPE tmp1 = p[i + 2] | p[i + 3];
VECTYPE tmp2 = p[i + 4] | p[i + 5];
VECTYPE tmp3 = p[i + 6] | p[i + 7];
VECTYPE tmp01 = tmp0 | tmp1;
VECTYPE tmp23 = tmp2 | tmp3;
if (!ALL_EQ(tmp01 | tmp23, zero)) {
break;
}
}
return i * sizeof(VECTYPE);
}
/*
 * Interactive experiment: scan an anonymous mapping for zero pages and show
 * how the process's peak RSS develops while doing so.
 *
 * A 1 GiB region is obtained from qemu_anon_ram_alloc() (swap in malloc() of
 * the same size to compare allocators).  Each of the 500 passes scans a
 * 10 MiB window that starts 1 MiB further into the region than the previous
 * one, in 4096-byte pages, then prints ru_maxrss and waits for a key press
 * on stdin.  The number of pages that scanned as entirely zero is printed at
 * the end.
 */
int main(void)
{
    const size_t page_bytes = 4096;
    const size_t window_bytes = (size_t)10 << 20;
    const size_t step_bytes = (size_t)1 << 20;
    char *x = qemu_anon_ram_alloc((size_t)1024 << 20);
    size_t zero_pages = 0;
    struct rusage rusage;
    int i;

    for (i = 0; i < 500; i++) {
        const char *window = x + (size_t)i * step_bytes;
        size_t j;

        for (j = 0; j < window_bytes; j += page_bytes) {
            /* A page is all zero when the scan runs to its very end. */
            if (buffer_find_nonzero_offset(window + j, page_bytes)
                == page_bytes) {
                zero_pages++;
            }
        }

        if (getrusage(RUSAGE_SELF, &rusage) != 0) {
            perror("getrusage");
            return EXIT_FAILURE;
        }
        printf("read offset: %d kB, RSS size: %ld kB", ((i + 1) << 10),
               rusage.ru_maxrss);
        /* The prompt has no newline; flush so it shows before we block. */
        fflush(stdout);
        getchar();
    }

    printf("%zu zero pages\n", zero_pages);
    return 0;
}