/*
 * SPDX-License-Identifier: GPL-2.0
 *
 * Copyright (c) 2018 Intel Corporation
 *
 * Authors: Fengguang Wu <fengguang.wu@intel.com>
 *          Liu Jingqi <jingqi.liu@intel.com>
 *          Peng Bo <bo2.peng@intel.com>
 */

#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <inttypes.h>
#include <libgen.h>
#include <linux/limits.h>
#include <linux/kernel.h>
#include <linux/kernel-page-flags.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/user.h>
#include <unistd.h>

#include "lib/memparse.h"
#include "lib/iomem_parse.h"
#include "lib/page-types.h"

#define IDLEMAP_CHUNK_SIZE	8
#define IDLEMAP_BUF_SIZE	(1<<20)

// 1TB
#define MAX_IDLEMAP_SIZE	(1UL<<40)

#define PFN_IDLE_BITMAP_PATH	"/sys/kernel/mm/page_idle/bitmap"
#define KERNEL_PAGE_FLAGS	"/proc/kpageflags"
#define GNUPLOT_PATH		"/usr/bin/gnuplot"
#define PLOT_SCRIPT		"plot-page-refs"

// globals
int g_debug;			// 1 == some, 2 == verbose
uint64_t *g_idlebuf;
int g_idlebuf_size;
int g_idlebuf_len;
unsigned short *g_refs_count;
unsigned short *g_refs_2m_count;
unsigned long g_offset, g_size;
unsigned long g_start_pfn, g_num_pfn, g_sim_2m_num_pfn;
unsigned char g_setidle_buf[IDLEMAP_BUF_SIZE];
int g_setidle_bufsize;
int g_idlefd;
int g_kpageflags_fd;
uint64_t *g_kpageflags_buf;
int g_kpageflags_buf_size;
int g_kpageflags_buf_len;
int page_shift;
int idle_shift;

int verbose_printf(int level, const char *format, ...)
{
	if (g_debug < level)
		return 0;

	va_list args;
	va_start(args, format);
	int ret = vprintf(format, args);
	va_end(args);

	return ret;
}

#define debug_printf(fmt, args...)	verbose_printf(1, fmt, ##args)
#define printdd(fmt, args...)		verbose_printf(2, fmt, ##args)

// on return:
// 	for nrefs in [0, max]:
// 		count_array[nrefs] = npages;
int count_refs(unsigned int max,
			unsigned long count_4k_array[],
			unsigned long count_2m_array[])
{
	unsigned long pfn;
	unsigned short nrefs;

	memset(count_4k_array, 0, (max + 1) * sizeof(count_4k_array[0]));
	memset(count_2m_array, 0, (max + 1) * sizeof(count_2m_array[0]));

	for (pfn = 0; pfn < g_num_pfn; pfn++) {
		nrefs = g_refs_count[pfn];
		if (nrefs > max)
			return 1;
		if (g_kpageflags_buf) {
			if (!(g_kpageflags_buf[pfn] & (1<<KPF_LRU))) {
				printdd("skip non-LRU %lx\n", pfn);
				continue;
			}
			if (g_kpageflags_buf[pfn] & ((1<<KPF_HUGE) | (1<<KPF_THP))) {
				count_2m_array[nrefs]++;
				printdd("pfn 0x%lx is a huge page.\n", pfn);
				if (!(g_kpageflags_buf[pfn] & (1<<KPF_COMPOUND_HEAD)))
					printf("pfn 0x%lx is non-head huge page.\n", pfn);
				continue;
			}
		}

		count_4k_array[nrefs]++;
	}

	return 0;
}

int count_sim_2m_refs(unsigned int max,
				unsigned long count_sim_2m_array[])
{
	unsigned long long pfn;
	unsigned short nrefs;

	memset(count_sim_2m_array, 0,
			(max + 1) * sizeof(count_sim_2m_array[0]));

	for (pfn = 0; pfn < g_sim_2m_num_pfn; pfn++) {
		nrefs = g_refs_2m_count[pfn];
		if (nrefs <= max) {
			if (g_kpageflags_buf &&
			    !(g_kpageflags_buf[pfn * 512] & (1<< KPF_LRU)))
				continue;
			count_sim_2m_array[nrefs]++;
		}
		else
			return 1;
	}

	return 0;
}

int output_refs_count(unsigned int loop, const char *output_file)
{
	unsigned short nrefs;
	FILE *file;
	unsigned long refs_4k_count[loop + 1];
	unsigned long refs_2m_count[loop + 1];
	unsigned long refs_sim_2m_count[loop + 1];

	if (count_refs(loop, refs_4k_count, refs_2m_count)) {
		fprintf(stderr, "refs count out of range\n");
		return -1;
	}

	if (count_sim_2m_refs(loop, refs_sim_2m_count)) {
		fprintf(stderr, "refs count out of range\n");
		return -1;
	}

	if ((file = fopen(output_file, "w")) == NULL) {
		perror("Can't open to write the output file");
		exit(2);
	}

	fprintf(file, "%-8s %-15s %-15s %-15s\n",
				"refs", "count_4K",
				"count_2M", "count_simulate_2M");
	fprintf(file, "=========================================================\n");

	for (nrefs = 0; nrefs <= loop; nrefs++) {
		fprintf(file, "%-8u %-15lu %-15lu %-15lu\n",
				(unsigned int)nrefs,
				refs_4k_count[nrefs],
				refs_2m_count[nrefs],
				refs_sim_2m_count[nrefs]);
	}
	fclose(file);
	return 0;
}

int output_pfn_refs(const char *output_file)
{
	FILE *file;
	unsigned long pfn;

	if (!g_refs_count)
		return -1;

	if ((file = fopen(output_file, "w")) == NULL) {
		perror("Can't open to write the output file");
		exit(2);
	}

	fprintf(file, "%-18s  refcount\n", "PFN");
	for (pfn = 0; pfn < g_num_pfn; pfn++) {
		fprintf(file, "0x%016lx  %3d  %s  %s\n",
			g_start_pfn + pfn,
			g_refs_count[pfn],
			g_kpageflags_buf ? page_flag_name(g_kpageflags_buf[pfn]) : "",
			g_kpageflags_buf ? page_flag_longname(g_kpageflags_buf[pfn]) : "");
	}
	fclose(file);
	return 0;
}

int account_refs(void)
{
	uint64_t idlebits, idlemap = 0;
	unsigned long len = 0;
	unsigned long base_pfn_2m = 0, base_pfn = 0;
	unsigned long pfn_2m = 0, pfn;
	int bit;
	int count_2m = 1;

	while (len < g_idlebuf_len) {
		idlebits = g_idlebuf[idlemap];
		printdd("idlebits: 0x%" PRIu64 "\n", idlebits);

		for (bit = 0; bit < 64; bit++) {
			if (!(idlebits & (1ULL << bit))) {
				pfn = base_pfn + bit;
				if (pfn >= g_num_pfn)
					break;
				g_refs_count[pfn]++;

				//only count once in a 2M page
				if (count_2m) {
					pfn_2m = pfn / 512;
					g_refs_2m_count[pfn_2m]++;
					count_2m = 0;
				}
				printdd("pfn 0x%lx refs_count 0x%lx\n",
					pfn, g_refs_count[pfn]);
			}
		}

		len += IDLEMAP_CHUNK_SIZE;
		idlemap = len / IDLEMAP_CHUNK_SIZE;
		base_pfn = idlemap * 64;
		base_pfn_2m = base_pfn / 512;
		if (base_pfn_2m > pfn_2m)
			count_2m = 1;
	}

	return 0;
}

int setidlemap(unsigned long offset, unsigned long size)
{
	ssize_t len = 0;

	if (lseek(g_idlefd, offset >> idle_shift, SEEK_SET) < 0) {
		perror("Can't seek bitmap file");
		exit(3);
	}

	while (size) {
		len = write(g_idlefd, g_setidle_buf, g_setidle_bufsize);

		if (len < 0) {
			// reach max PFN
			if (errno == ENXIO)
				return 0;

			perror("setidlemap: error writing idle bitmap");
			return len;
		}
		size -= len << idle_shift;
	}

	return 0;
}

int loadidlemap(unsigned long offset, unsigned long size)
{
	char *p;
	ssize_t len = 0;
	int err = 0;

	if (lseek(g_idlefd, offset >> idle_shift, SEEK_SET) < 0) {
		printf("Can't seek bitmap file");
		err = -4;
		goto _loadidlemap_out;
	}

	p = (char *)g_idlebuf;
	g_idlebuf_len = 0;

	// unfortunately, larger reads do not seem supported
	while ((len = read(g_idlefd, p, g_idlebuf_size - g_idlebuf_len)) > 0) {
		p += len;
		g_idlebuf_len += len;
		size -= len << idle_shift;
		if (size <= 0)
			break;
	}
	debug_printf("g_idlebuf_len: 0x%lx\n", g_idlebuf_len);

_loadidlemap_out:
	return err;
}

int loadflags(unsigned long pfn, unsigned long num_pfn)
{
	char *p;
	ssize_t len = 0;
	int err = 0;

	g_kpageflags_fd = open(KERNEL_PAGE_FLAGS, O_RDONLY);
	if (g_kpageflags_fd < 0) {
		printf("Can't read file %s!\n", KERNEL_PAGE_FLAGS);
		err = -3;
		goto out;
	}

	g_kpageflags_buf_size = g_num_pfn * sizeof(unsigned long long);
	g_kpageflags_buf = calloc(g_num_pfn, sizeof(unsigned long long));
	if (!g_kpageflags_buf) {
		printf("Can't allocate memory for kpageflags buf (%d bytes)!\n",
		       g_kpageflags_buf_size);
		err = -1;
		goto out;
	}

	if (lseek(g_kpageflags_fd,
		  pfn * sizeof(unsigned long long), SEEK_SET) < 0) {
		printf("Can't seek kernel page flags file");
		err = -4;
		goto out;
	}

	p = (char *)g_kpageflags_buf;
	g_kpageflags_buf_len = 0;

	// unfortunately, larger reads do not seem supported
	while ((len = read(g_kpageflags_fd, p,
			   g_kpageflags_buf_size - g_kpageflags_buf_len)) > 0) {
		p += len;
		g_kpageflags_buf_len += len;
		num_pfn--;
		if (num_pfn <= 0)
			break;
	}
	debug_printf("g_kpageflags_buf_len: 0x%lx\n", g_kpageflags_buf_len);

out:
	return err;
}

static const struct option opts[] = {
	{"offset",	required_argument,	NULL,	'o'},
	{"size",	required_argument,	NULL,	's'},
	{"range",	required_argument,	NULL,	'r'},
	{"interval",	required_argument,	NULL,	'i'},
	{"loop",	required_argument,	NULL,	'l'},
	{"bitmap",	required_argument,	NULL,	'b'},
	{"output",	required_argument,	NULL,	'f'},
	{"verbose",	required_argument,	NULL,	'v'},
	{"help",	no_argument,		NULL,	'h'},
	{NULL,		0,			NULL,	0}
};

static void usage(char *prog)
{
	fprintf(stderr,
		"%s [option] ...\n"
		"    -h|--help		Show this information.\n"
		"    -o|--offset	The start offset of the bitmap to be scanned.\n"
		"    -s|--size		The size should be scanned.\n"
		"    -r|--range		The hex addr range to scan.\n"
		"    -i|--interval	The interval to scan bitmap.\n"
		"    -l|--loop		The number of times to scan bitmap.\n"
		"    -b|--bitmap	The bitmap file for scanning.\n"
		"    -f|--output	The output file for the result of scanning.\n"
		"    -v|--verbose	Show debug info.\n",
		prog);
}

static inline unsigned long long tv_delta(struct timeval t1, struct timeval t2)
{
	return  (t2.tv_sec  - t1.tv_sec) * 1000000 +
		(t2.tv_usec - t1.tv_usec);
}

static int plot_output(char *argv[], char *output_file)
{
	char mypath[PATH_MAX];
	struct stat sb;

	if (stat(GNUPLOT_PATH, &sb))
		return -1;

	if (!(sb.st_mode & S_IXUSR))
		return -2;

	if (!realpath(argv[0], mypath))
		return -3;

	return system(strcat(dirname(mypath), "/" PLOT_SCRIPT));
}


int main(int argc, char *argv[])
{
	int i, ret = 0;
	double interval = 0.1;
	unsigned short loop = 1;
	int pagesize, opt = 0, options_index = 0;
	char bitmap_file[PATH_MAX] = PFN_IDLE_BITMAP_PATH;
	char output_file[PATH_MAX] = "refs_count";
	char pfn_outfile[PATH_MAX] = "pfn_refs";
	const char *optstr = "hvo:s:r:i:l:b:f:";
	unsigned long offset = 0, size = 0;
	unsigned int bufsize;
	unsigned long long set_us, read_us, dur_us, slp_us, account_us;
	static struct timeval ts1, ts2, ts3, ts4, ts5;
	int is_pfn_bitmap;

	/* parse the options */
	while ((opt = getopt_long(argc, argv, optstr, opts, &options_index)) != EOF) {
		switch (opt) {
		case 0:
			break;
		case 'o':
			offset = memparse(optarg, NULL);
			break;
		case 's':
			size = memparse(optarg, NULL);
			break;
		case 'r':
			if (sscanf(optarg, "%lx-%lx", &offset, &size) != 2) {
				fprintf(stderr, "range format: hex_start-hex_end\n");
				exit(1);
			}
			size -= offset;
			break;
		case 'i':
			interval = atof(optarg);
			debug_printf("interval = %.2f\n", interval);
			break;
		case 'l':
			loop = atoi(optarg);
			debug_printf("loop number = %d\n", loop);
			break;
		case 'b':
			strcpy(bitmap_file, optarg);
			debug_printf("bitmap_file = %s\n", bitmap_file);
			break;
		case 'f':
			strcpy(output_file, optarg);
			debug_printf("output_file = %s\n", output_file);
			break;
		case 'v':
			g_debug++;
			break;
		case 'h':
		case '?':
		default:
			usage(argv[0]);
			exit(0);
		}
	}

	is_pfn_bitmap = !strcmp(bitmap_file, PFN_IDLE_BITMAP_PATH);

	pagesize = getpagesize();
	page_shift = ffs(pagesize) - 1;
	if (is_pfn_bitmap)
		idle_shift = page_shift + 3;
	else
		idle_shift = page_shift + 3; // may be different later

	offset = offset & ~(unsigned long)(pagesize * 64 - 1);
	size = __ALIGN_KERNEL(size, pagesize * 64);

	printf("Watching the %s to calculate page reference count every %.2f seconds for %d times.\n"
	       "The result will be saved in %s.\n",
	       bitmap_file,
	       interval,
	       loop,
	       output_file);

	if (!size)
		size = MAX_IDLEMAP_SIZE;

	bufsize = size / (pagesize * 8);

	g_setidle_bufsize = bufsize < sizeof(g_setidle_buf) ?
			    bufsize : sizeof(g_setidle_buf);
	memset(g_setidle_buf, 0xff, g_setidle_bufsize);

	if ((g_idlefd = open(bitmap_file, O_RDWR)) < 0) {
		perror("Can't write bitmap file");
		exit(2);
	}

	if ((g_idlebuf = malloc(bufsize)) == NULL) {
		printf("Can't allocate memory for idlemap buf (%d bytes)!\n",
		       bufsize);
		ret = -1;
		goto out;
	}
	debug_printf("size: 0x%lx, bufsize: 0x%lx\n", size, bufsize);
	g_idlebuf_size = bufsize;

	g_start_pfn = offset / pagesize;
	g_num_pfn = size / pagesize;
	debug_printf("The start pfn is 0x%lx, the number of pfn is 0x%lx.\n",
		     g_start_pfn, g_num_pfn);

	g_refs_count = calloc(g_num_pfn, sizeof(unsigned short));
	if (g_refs_count == NULL) {
		printf("Can't allocate memory for refs count buf (%lu bytes)!\n",
		       sizeof(unsigned short) * g_num_pfn);
		ret = -2;
		goto out;
	}

	g_sim_2m_num_pfn = (g_num_pfn + 511) / 512;
	debug_printf("g_sim_2m_num_pfn: 0x%lx\n", g_sim_2m_num_pfn);
	g_refs_2m_count =
			calloc(g_sim_2m_num_pfn, sizeof(unsigned short));
	if (g_refs_2m_count == NULL) {
		printf("Can't allocate memory for 2M refs count buf (%lu bytes)!\n",
		       sizeof(unsigned short) * g_sim_2m_num_pfn);
		ret = -4;
		goto out;
	}

	if (is_pfn_bitmap) {
		ret = loadflags(g_start_pfn, g_num_pfn);
		if (ret)
			goto out;
	}

	for (i = 0; i < loop; i++) {
		// set idle flags
		gettimeofday(&ts1, NULL);
		// the per-task idle bitmap will auto clear A bits on read
		if (is_pfn_bitmap)
			ret = setidlemap(offset, size);
		if (ret)
			goto out;

		// sleep
		gettimeofday(&ts2, NULL);
		usleep((int)(interval * 1000000));
		gettimeofday(&ts3, NULL);

		// read idle flags
		ret = loadidlemap(offset, size);
		if (ret)
			goto out;

		gettimeofday(&ts4, NULL);
		ret = account_refs();
		if (ret)
			goto out;
		gettimeofday(&ts5, NULL);

		// calculate times
		set_us = tv_delta(ts1, ts2);
		slp_us = tv_delta(ts2, ts3);
		read_us = tv_delta(ts3, ts4);
		account_us = tv_delta(ts4, ts5);
		dur_us = tv_delta(ts1, ts5);

		if (g_debug) {
			printf("set time   : %.6f s\n", (double)set_us / 1000000);
			printf("sleep time : %.6f s\n", (double)slp_us / 1000000);
			printf("read time  : %.6f s\n", (double)read_us / 1000000);
			printf("count time : %.6f s\n", (double)account_us / 1000000);
			printf("dur time   : %.6f s\n", (double)dur_us / 1000000);
		}
	}

	output_refs_count(loop, output_file);
	if (g_debug)
		output_pfn_refs(pfn_outfile);
	plot_output(argv, output_file);

out:
	if (g_idlefd)
		close(g_idlefd);
	if (g_kpageflags_fd)
		close(g_kpageflags_fd);

	if (g_idlebuf)
		free(g_idlebuf);
	if (g_refs_count)
		free(g_refs_count);
	if (g_kpageflags_buf)
		free(g_kpageflags_buf);

	return ret;
}