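// The program (sieve of Eratosthenes). The pseudo-code below uses a limit of
// 1024 for readability; the encoded program in main() uses 1<<26.
// #define i mem[0]
// #define j mem[1]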
// for (i = 2; i < 1024; i++) mem[i] = i;
// i = 1; // So the scan will catch 2. mem[1] will be ignored.
// while (1) {
//   // Scan to next prime
//   do {i++; if (i == 1024) goto done; } while (mem[i] == 0);
//   // i is now prime. Zero all of its multiples.
//   for (j = 2*i; j < 1024; j+=i) mem[j] = 0;
// }
// done: halt
#include <stdint.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>
#include <iostream>
#include <iomanip>
#include <map>

// 64-bit unsigned integer type; not referenced elsewhere in the visible part
// of this file.
typedef uint64_t addr_t;
// Width of the simulated machine's accumulator and of each memory cell.
typedef uint32_t word_t;

// Forward declaration: Machine stores an Op* before Op itself is defined.
class Op;

class Machine {
public:
  bool running;

  word_t acc;        // Accumulator
  word_t mem[1<<26]; // Memory

  Op *next; // Pointer to the next operation.

  Machine() : running(true) { for(unsigned i = 0; i < 1<<26; i++) mem[i] = 0; }

  void show_state() {
    using std::cout; using std::endl;    using std::hex; 
    using std::setw; using std::setfill; using std::dec;

    cout << "A = 0x" << hex << setw(4) << setfill('0') << acc << '\n';
    for (unsigned start_i = 0; start_i < 64; start_i += 8) {
      cout << "0x" << hex << setw(4) << setfill('0') << start_i << ':';
      for (unsigned i = start_i; i < start_i + 8; i++) {
        cout << dec << ' ' << setw(4) << setfill('0') << mem[i];
      } 
      cout << endl;
    }
    cout << endl;
  }
};

// Opcodes of the simulated machine. Each enumerator's numeric value is used
// as an index into disp_table, so the two lists must stay in the same order.
enum op_type {
  HALT,             // stop the machine
  LOAD,             // acc = mem[arg]
  INDIRECT_LOAD,    // acc = mem[mem[arg]]
  STORE,            // mem[arg] = acc
  INDIRECT_STORE,   // mem[mem[arg]] = acc
  BRANCH_ALWAYS,    // next = arg
  BRANCH_ZERO,      // if (acc == 0) next = arg
  BRANCH_NOT_ZERO,  // if (acc != 0) next = arg
  MEM_ADD,          // acc += mem[arg]
  MEM_SUB,          // acc -= mem[arg]
  MEM_NAND,         // acc = ~(acc & mem[arg])
  IMM_ADD,          // acc += arg
  IMM_SUB,          // acc -= arg
  IMM_NAND,         // acc = ~(acc & arg)
  IMM_LOAD          // acc = arg
};

// Dispatch table and micro-op implementations.
//
// Every micro-op takes its operand as a single uint64_t and acts on the
// machine that the global __m points to. They have C linkage and a uniform
// signature so that they can be stored in disp_table and called by address.
extern "C" {
  // Machine the micro-ops operate on. Must be set before any micro-op runs.
  Machine *__m;

  // arg is unused by HALT.
  void __halt(uint64_t)               { __m->running = false; }
  // For the memory ops, arg is an index into mem (no bounds checking).
  void __load(uint64_t arg)           { __m->acc = __m->mem[arg]; }
  void __indirect_load(uint64_t arg)  { __m->acc = __m->mem[__m->mem[arg]]; }
  void __store(uint64_t arg)          { __m->mem[arg] = __m->acc; }
  void __indirect_store(uint64_t arg) { __m->mem[__m->mem[arg]] = __m->acc; }
  // For the branches, arg is the address of the target Op.
  void __branch_always(uint64_t arg)  { __m->next = (Op*)arg; }
  void __branch_zero(uint64_t arg)    {if(__m->acc==0) __m->next = (Op*)arg;}
  void __branch_not_zero(uint64_t arg){if(__m->acc!=0) __m->next = (Op*)arg;}
  void __mem_add(uint64_t arg)        { __m->acc += __m->mem[arg]; }
  void __mem_sub(uint64_t arg)        { __m->acc -= __m->mem[arg]; }
  void __mem_nand(uint64_t arg)       { __m->acc = ~(__m->acc&__m->mem[arg]);}
  // For the immediate ops, arg is the value itself; results are truncated to
  // the 32-bit accumulator.
  void __imm_add(uint64_t arg)        { __m->acc += arg; }
  void __imm_sub(uint64_t arg)        { __m->acc -= arg; }
  void __imm_nand(uint64_t arg)       { __m->acc = ~(__m->acc&arg); }
  void __imm_load(uint64_t arg)       { __m->acc = arg; }

  typedef void (*dispatch_ent)(uint64_t);

  // Micro-op for each opcode, indexed by op_type value; the order here must
  // match the order of the op_type enumerators.
  dispatch_ent disp_table[] = {
    __halt,           __load,          __indirect_load,   __store, 
    __indirect_store, __branch_always, __branch_zero,     __branch_not_zero,
    __mem_add,        __mem_sub,       __mem_nand,        __imm_add,
    __imm_sub,        __imm_nand,      __imm_load
  };

  // Start of the buffer that holds the translated program.
  uint8_t *code_cache;

  uint64_t run_code_cache(uint8_t *loc) __attribute__((noinline, regparm(1)));

  // Enters translated code at `loc` and returns the number of operations it
  // executed.
  //
  // Protocol shared with the translator in main(): the caller's %rbx is
  // pushed, %rbx is zeroed to serve as the operation counter, and control
  // jumps to `loc`. The translated code increments %rbx once per operation
  // and leaves with "mov %rbx,%rax; pop %rbx; ret", i.e. it returns straight
  // to the caller of run_code_cache with the count in %rax. Control never
  // comes back to the statement after the asm.
  //
  // NOTE(review): this only works if the compiler emits no stack-adjusting
  // prologue before the asm (the translated code's `ret` expects the return
  // address directly above the pushed %rbx), so it depends on the function
  // being compiled without a frame. For the same reason %rbx must NOT be
  // added to the clobber list: that would make the compiler push it too.
  uint64_t run_code_cache(uint8_t *loc) {
    uint64_t executed;
    asm volatile("push %%rbx\nmov $0, %%rbx\njmp *%1\n"
                 : "=a"(executed) // The count really arrives in %rax; saying
                                  // so stops the compiler assuming a constant.
                 : "D"(loc)       // Pinned to %rdi: a free "r" choice could
                                  // pick %rbx, which is zeroed before the jmp.
                 : "memory");     // Translated code and the micro-ops write
                                  // *__m; callers must reload it afterwards.
    return executed;
  }
}

// One instruction of the simulated machine: an opcode plus a 64-bit operand.
// Depending on the opcode the operand is an immediate value, an index into
// Machine::mem, or (for branches) the address of the target Op.
class Op {
public:
  op_type  type;
  uint64_t arg;

  // Operand-less instruction. arg is zeroed so that code copying it (such as
  // the translator in main) never reads an indeterminate value.
  Op(op_type o)               : type(o), arg(0) {}
  // Instruction with an immediate value or memory index.
  Op(op_type o, word_t value) : type(o), arg(value) {}
  // Branch instruction; stores the address of `target`, which must therefore
  // stay at the same location for as long as this Op is in use.
  Op(op_type o, Op& target)
    : type(o), arg(reinterpret_cast<uint64_t>(&target)) {}

  virtual ~Op() {}

  // Interprets this instruction on `m`. Also repoints the global __m at `m`,
  // because the micro-ops reach the machine through that global.
  void execute(Machine& m) { __m = &m; disp_table[type](arg); }

  // True for the three branch opcodes.
  bool is_branch() const { return (type == BRANCH_ALWAYS  ||
                                   type == BRANCH_ZERO    ||
                                   type == BRANCH_NOT_ZERO); }
};

// Initialize the program and run it.
int main() {

  Machine *m = new Machine();

  const unsigned PROG_SIZE = 31;

  Op program[] = {
    Op(IMM_LOAD,       2),             //  0
    Op(STORE,          0),             //  1

    Op(INDIRECT_STORE, 0),             //  2 INIT1
    Op(IMM_SUB,        (1<<26)-1),     //  3
    Op(BRANCH_ZERO,    program[9]),    //  4
    Op(LOAD,           0),             //  5
    Op(IMM_ADD,        1),             //  6
    Op(STORE,          0),             //  7
    Op(BRANCH_ALWAYS,  program[2]),    //  8 

    Op(IMM_LOAD,       1),             //  9 INIT2
    Op(STORE,          0),             // 10

    Op(LOAD,           0),             // 11 MAIN
    Op(IMM_ADD,        1),             // 12
    Op(STORE,          0),             // 13
    Op(IMM_SUB,        (1<<26)),       // 14
    Op(BRANCH_ZERO,    program[30]),   // 15
    Op(INDIRECT_LOAD,  0),             // 16
    Op(BRANCH_ZERO,    program[11]),   // 17
    Op(LOAD,           0),             // 18
    Op(STORE,          1),             // 19

      Op(LOAD,           0),           // 20 ZERO
      Op(MEM_ADD,        1),           // 21
      Op(STORE,          1),           // 22
      Op(IMM_SUB,        1<<26),       // 23
      Op(IMM_NAND,       0x80000000),  // 24
      Op(IMM_NAND,       0xffffffff),  // 25
      Op(BRANCH_ZERO,    program[11]), // 26

      Op(IMM_LOAD,       0),           // 27
      Op(INDIRECT_STORE, 1),           // 28
      Op(BRANCH_ALWAYS,  program[20]), // 29

    Op(HALT),                          // 30 DONE
  };

  // Create a code cache.
  code_cache = (uint8_t*)mmap(NULL, sysconf(_SC_PAGESIZE), 
                         PROT_EXEC|PROT_READ|PROT_WRITE, 
                         MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);

  if (!code_cache) { std::cout << "Could not mmap a code cache.\n"; exit(1); }

  // Perform translation
  uint8_t *block_loc[PROG_SIZE];
  uint8_t *tc_pos = code_cache;
  for (unsigned i = 0; i < PROG_SIZE; i++) {
    block_loc[i] = tc_pos;

    word_t *acc_addr = &(m->acc);
    word_t *mem_base = m->mem;

    if (program[i].type == IMM_ADD) {
      *(tc_pos++) = 0x48;                     // mov &acc, %rax
      *(tc_pos++) = 0xb8;
      *((uint32_t**)tc_pos) = acc_addr;       tc_pos+=8;
      *(tc_pos++) = 0x81;                     // add (%rax), arg
      *(tc_pos++) = 0x00;
      *((uint32_t*)tc_pos) = program[i].arg;  tc_pos+=4;
    } else if (program[i].type == IMM_SUB) {
      *(tc_pos++) = 0x48;                     // mov &acc, %rax
      *(tc_pos++) = 0xb8;
      *((uint32_t**)tc_pos) = acc_addr;       tc_pos+=8;
      *(tc_pos++) = 0x81;                     // sub (%eax), arg
      *(tc_pos++) = 0x28;
      *((uint32_t*)tc_pos) = program[i].arg;  tc_pos+=4;
    } else if (program[i].type == LOAD) {
      *(tc_pos++) = 0x48;                     // mov (mem_base+arg), %rax
      *(tc_pos++) = 0xb8;
      *((uint32_t**)tc_pos) = &mem_base[program[i].arg]; tc_pos += 8;
      *(tc_pos++) = 0x48;                     // mov &acc, %rcx
      *(tc_pos++) = 0xb9;
      *((uint32_t**)tc_pos) = acc_addr;       tc_pos+=8;
      *(tc_pos++) = 0x8b;                     // mov (%rax), %eax
      *(tc_pos++) = 0x00;
      *(tc_pos++) = 0x89;                     // mov %eax, (%rcx)
      *(tc_pos++) = 0x01;
    } else if (program[i].type == STORE) {
      *(tc_pos++) = 0x48;                     // mov (mem_base+arg), %rax
      *(tc_pos++) = 0xb8;
      *((uint32_t**)tc_pos) = &mem_base[program[i].arg]; tc_pos += 8;
      *(tc_pos++) = 0x48;                     // mov &acc, %rcx
      *(tc_pos++) = 0xb9;
      *((uint32_t**)tc_pos) = acc_addr;       tc_pos+=8;
      *(tc_pos++) = 0x8b;                     // mov (%rcx), %ecx
      *(tc_pos++) = 0x09;
      *(tc_pos++) = 0x89;                     // mov %ecx, (%rax)
      *(tc_pos++) = 0x08;
    } else if (program[i].type == IMM_NAND) {
      *(tc_pos++) = 0x48;                     // mov &acc, %rax
      *(tc_pos++) = 0xb8;
      *((uint32_t**)tc_pos) = acc_addr;       tc_pos+=8;
      *(tc_pos++) = 0x81;                     // and (%rax), arg
      *(tc_pos++) = 0x20;
      *((uint32_t*)tc_pos) = program[i].arg;  tc_pos+=4;
      *(tc_pos++) = 0xf7;                     // not (%rax) f7 /2
      *(tc_pos++) = 0x10;
    } else if (program[i].type == MEM_ADD) {
      *(tc_pos++) = 0x48;                     // mov &acc, %rax
      *(tc_pos++) = 0xb8;
      *((uint32_t**)tc_pos) = acc_addr;       tc_pos+=8;
      *(tc_pos++) = 0x48;                     // mov (mem_base+arg), %rcx
      *(tc_pos++) = 0xb9;
      *((uint32_t**)tc_pos) = &mem_base[program[i].arg]; tc_pos += 8;
      *(tc_pos++) = 0x8b;                     // mov (%rcx), %ecx
      *(tc_pos++) = 0x09;
      *(tc_pos++) = 0x01;                     // addl %ecx, (%rax)
      *(tc_pos++) = 0x08;
    } else {
      // movq [argument], %rdi
      *(tc_pos++) = 0x48;
      *(tc_pos++) = 0xbf;
      *(uint64_t*)(tc_pos) = program[i].arg; tc_pos += 8;

      // mov [func addr], %rax
      *(tc_pos++) = 0x48;
      *(tc_pos++) = 0xb8;
      *(dispatch_ent*)tc_pos = disp_table[program[i].type]; tc_pos += 8;
      // call *(rax)
      *(tc_pos++) = 0xff;
      *(tc_pos++) = 0xd0;
    }

    // inc rbx
    *(tc_pos++) = 0x48;
    *(tc_pos++) = 0xff;
    *(tc_pos++) = 0xc3;

    // If we've hit the end of the basic block, return from the code cache.
    if (program[i].is_branch()) {
      *(tc_pos++) = 0x48; // mov %rbx, %rax
      *(tc_pos++) = 0x89;
      *(tc_pos++) = 0xd8;
      *(tc_pos++) = 0x5b; // pop %rbx
      //*(tc_pos++) = 0xc9; // leave
      *(tc_pos++) = 0xc3; // ret
    }
  }
  *(tc_pos++) = 0x48; // mov %rbx, %rax
  *(tc_pos++) = 0x89;
  *(tc_pos++) = 0xd8;
  *(tc_pos++) = 0x5b; // pop %rbx
  //*(tc_pos++) = 0xc9; // leave
  *(tc_pos++) = 0xc3; // ret

  __m = m;
  uint64_t icount = 0;
  m->next = &program[0];
  while (m->running) {
    Op *current = m->next;
    m->next = 0;
    
    int n = run_code_cache(block_loc[current - program]);
    if (m->next == 0) m->next = current+n;

    if (icount%100000000 > (icount+n)%100000000) {
      std::cout /*<< "n = " << n*/ << "; Inst " << icount+n << ".\n";
    }
 
    icount += n;
  }

  std::cout << icount << " total instructions.\n";

  m->show_state();

  munmap(code_cache, sysconf(_SC_PAGESIZE));

  return 0;
}
