diff --git a/m1/perf/main.c b/m1/perf/main.c
new file mode 100644
index 0000000..bf3999d
--- /dev/null
+++ b/m1/perf/main.c
@@ -0,0 +1,71 @@
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "compiler.h"
+
+
+#define BUF_SIZE 1000000	/* maximum patch size, including the final NUL */
+
+
+/* Message callback passed to patch_compile: print s and a newline on stderr. */
+static void report(const char *s)
+{
+	fprintf(stderr, "%s\n", s);
+}
+
+
+/* Print the command-line synopsis on stderr and exit with status 1. */
+static void usage(const char *name)
+{
+	fprintf(stderr, "usage: %s patch-file [loops]\n", name);
+	exit(1);
+}
+
+
+/*
+ * Read a patch file into memory and run patch_compile on it "loops" times
+ * (default: once).
+ */
+int main(int argc, char **argv)
+{
+	/* static: keeps the ~1 MB buffer off the stack */
+	static char buf[BUF_SIZE];
+	const char *name;
+	FILE *file;
+	size_t got;
+	int loops = 1;
+	int i;
+
+	switch (argc) {
+	case 2:
+		break;
+	case 3:
+		loops = atoi(argv[2]);
+		break;
+	default:
+		usage(*argv);
+	}
+
+	name = argv[1];
+	file = fopen(name, "r");
+	if (!file) {
+		perror(name);
+		exit(1);
+	}
+	/*
+	 * Item size 1, so that fread returns the number of bytes read. A short
+	 * count is normal at EOF; only ferror indicates a real failure.
+	 */
+	got = fread(buf, 1, sizeof(buf)-1, file);
+	if (ferror(file)) {
+		perror(name);
+		exit(1);
+	}
+	buf[got] = 0;
+	fclose(file);
+
+	for (i = 0; i != loops; i++)
+		patch_compile(buf, report);
+
+	return 0;
+}
diff --git a/m1/perf/sched.c b/m1/perf/sched.c
new file mode 100644
index 0000000..0123e9a
--- /dev/null
+++ b/m1/perf/sched.c
@@ -0,0 +1,476 @@
+/*
+ * sched.c - O(n) ... O(n^2) scheduler
+ *
+ * Written 2011 by Werner Almesberger
+ *
+ * Based on gfpus.c
+ * Copyright (C) 2007, 2008, 2009, 2010 Sebastien Bourdeauducq
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <fpvm/is.h>
+#include <fpvm/fpvm.h>
+#include <fpvm/pfpu.h>
+#include <fpvm/gfpus.h>
+
+#include <hw/pfpu.h>
+
+
+#define MAX_LATENCY	8	/* maximum latency; okay to make this bigger */
+
+/* Debug output; compiled out here. Define as printf(__VA_ARGS__) to trace. */
+#define Dprintf(...)
+
+/* Circular doubly-linked list node; an empty list head points to itself. */
+struct list {
+	struct list *next, *prev;
+};
+
+/* Scheduler state of one instruction; insns[i] describes frag->code[i]. */
+static struct insn {
+	struct list more;		/* more insns on same schedule */
+	struct fpvm_instruction *vm_insn; /* instruction being scheduled */
+	struct data_ref {
+		struct list more;	/* more refs sharing the data */
+		struct insn *insn;	/* insn this is part of */
+		struct insn *dep;	/* insn we depend on */
+	} opa, opb, dest, cond;
+	int arity;	/* operand count (0..3), from fpvm_get_arity */
+	int latency;	/* cycles from issue to exit, from pfpu_get_latency */
+	int unresolved;	/* number of data refs we need before we can sched */
+	int earliest;	/* earliest cycle dependencies seen so far are met */
+	struct list dependants;	/* data refs of insns that depend on us */
+	int num_dependants;	/* number of refs on "dependants" */
+} insns[FPVM_MAXCODELEN];
+
+
+/* ----- Register initialization ------------------------------------------- */
+
+
+/*
+ * Straight from gfpus.c, only with some whitespace changes.
+ */
+
+static void get_registers(struct fpvm_fragment *fragment,
+    unsigned int *registers)
+{
+	int i;
+	union {
+		float f;
+		unsigned int n;
+	} fconv;	/* reinterprets a float constant as its raw bits */
+
+	for(i = 0; i < fragment->nbindings; i++)
+		if (fragment->bindings[i].isvar)
+			registers[i] = 0;	/* variables start at zero */
+		else {
+			fconv.f = fragment->bindings[i].b.c;
+			registers[i] = fconv.n;
+		}
+	for(; i < PFPU_REG_COUNT; i++)	/* clear the unbound registers */
+		registers[i] = 0;
+}
+
+
+/* ----- Doubly-linked list ------------------------------------------------ */
+
+
+/*
+ * Use naming conventions of include/linux/list.h
+ */
+/* Make "list" an empty list, i.e., a head that points to itself. */
+static void list_init(struct list *list)
+{
+	list->next = list->prev = list;
+}
+
+/* Unlink item from its list; item's own pointers are left untouched. */
+static void list_del(struct list *item)
+{
+	item->prev->next = item->next;
+	item->next->prev = item->prev;
+}
+
+/* Remove and return the first element of list, or NULL if it is empty. */
+static void *list_pop(struct list *list)
+{
+	struct list *first;
+
+	first = list->next;
+	if (first == list)
+		return NULL;
+	list_del(first);
+	return first;
+}
+
+/* Insert item at the end of list, i.e., just before the head. */
+static void list_add_tail(struct list *list, struct list *item)
+{
+	item->next = list;
+	item->prev = list->prev;
+	list->prev->next = item;
+	list->prev = item;
+}
+
+/* Insert item at the beginning of list, i.e., right after the head. */
+static void list_add(struct list *list, struct list *item)
+{
+	item->next = list->next;
+	item->prev = list;
+	list->next->prev = item;
+	list->next = item;
+}
+
+/* Append all elements of b to the end of a and leave b empty. */
+static void list_concat(struct list *a, struct list *b)
+{
+	if (b->next != b) {
+		a->prev->next = b->next;
+		b->next->prev = a->prev;
+		b->prev->next = a;
+		a->prev = b->prev;
+	}
+	list_init(b);
+}
+
+
+/*
+ * Do not delete elements from the list while traversing it with foreach !
+ */
+
+#define foreach(var, head) \
+	for (var = (void *) ((struct list *) (head))->next; \
+	    (var) != (void *) (head); \
+	    var = (void *) ((struct list *) (var))->next)
+
+
+/* ----- Register management ----------------------------------------------- */
+
+/* State of one VM register; regs[] is indexed with reg2idx(). */
+static struct vm_reg {
+	struct insn *setter;	/* instruction setting it; NULL if none */
+	int pfpu_reg;		/* underlying PFPU register */
+	int refs;		/* usage count */
+} *regs;
+
+static struct list pfpu_regs[PFPU_REG_COUNT];	/* one node per PFPU reg */
+static struct list unallocated;	/* unallocated registers */
+static int nbindings;		/* "public" bindings */
+
+/* Index into regs[]: reg itself if non-negative, else nbindings+|reg|. */
+static int reg2idx(int reg)
+{
+	return reg >= 0 ? reg : nbindings-reg;
+}
+
+/* Assign a PFPU register to setter's result; aborts if none is free. */
+static int alloc_reg(struct insn *setter)
+{
+	struct list *reg;
+	int vm_reg, pfpu_reg, vm_idx;
+
+	vm_reg = setter->vm_insn->dest;
+	if (vm_reg >= 0)
+		return vm_reg;
+	reg = list_pop(&unallocated);
+	if (!reg)
+		abort();
+	pfpu_reg = reg-pfpu_regs;
+Dprintf(" alloc reg %d -> %d\n", vm_reg, pfpu_reg);
+	vm_idx = reg2idx(vm_reg);
+	regs[vm_idx].setter = setter;
+	regs[vm_idx].pfpu_reg = pfpu_reg;
+	regs[vm_idx].refs = setter->num_dependants+1; /* +1: dropped at exit */
+	return pfpu_reg;
+}
+
+/* Drop one reference to setter's register and free it after the last one. */
+static void put_reg(struct insn *setter)
+{
+	int vm_reg, vm_idx;
+
+	if (!setter)
+		return;
+
+	vm_reg = setter->vm_insn->dest;
+	if (vm_reg >= 0)
+		return;
+
+	vm_idx = reg2idx(vm_reg);
+	if (--regs[vm_idx].refs)
+		return;
+
+Dprintf(" free reg %d\n", regs[vm_idx].pfpu_reg);
+	/*
+	 * Prepend so that register numbers stay small and bugs reveal
+	 * themselves more rapidly.
+	 */
+	list_add(&unallocated, pfpu_regs+regs[vm_idx].pfpu_reg);
+
+	/* clear it for style only */
+	regs[vm_idx].setter = NULL;
+	regs[vm_idx].pfpu_reg = 0;
+}
+
+/* Translate a VM register to the PFPU register currently backing it. */
+static int lookup_pfpu_reg(int vm_reg)
+{
+	return vm_reg >= 0 ? vm_reg : regs[reg2idx(vm_reg)].pfpu_reg;
+}
+
+/* Set initial register values and build the pool of free PFPU registers. */
+static void init_registers(struct fpvm_fragment *fragment,
+    unsigned int *registers)
+{
+	size_t nregs;
+	int i;
+
+	get_registers(fragment, registers);
+	nbindings = fragment->nbindings;
+
+	nregs = nbindings-fragment->next_sur;
+	regs = calloc(nregs, sizeof(*regs));	/* freed in gfpus_schedule */
+	if (!regs && nregs)
+		abort();
+
+	list_init(&unallocated);
+	for (i = fragment->nbindings; i < PFPU_REG_COUNT; i++)
+		list_add_tail(&unallocated, pfpu_regs+i);
+
+/*
+ * @@@ the rules are more complex, see use of dont_touch in
+ * init_scheduler_state
+ */
+}
+
+
+/* ----- Instruction scheduler --------------------------------------------- */
+
+
+static struct list unscheduled;	/* unscheduled insns */
+static struct list waiting;	/* insns waiting to be scheduled */
+static struct list ready[PFPU_PROGSIZE]; /* insns ready at nth cycle */
+static struct insn *exits[PFPU_PROGSIZE+MAX_LATENCY];
+			/* insn writing at nth cycle */
+static struct insn dummy_insn;	/* dummy, to signal occupancy */
+
+/* Link ref to the insn currently setting reg_num, if any; returns the reg. */
+static struct vm_reg *add_data_ref(struct insn *insn, struct data_ref *ref,
+    int reg_num)
+{
+	struct vm_reg *reg;
+
+	reg = regs+reg2idx(reg_num);
+	ref->insn = insn;
+	ref->dep = reg->setter;
+	if (ref->dep) {
+		list_add_tail(&ref->dep->dependants, &ref->more);
+		ref->dep->num_dependants++;
+		insn->unresolved++;
+Dprintf("insn %lu: reg %d setter %lu unresolved %d\n",
+    insn-insns, reg_num, reg->setter-insns, insn->unresolved);
+	} else {
+		list_init(&ref->more);
+	}
+	return reg;
+}
+
+int catch = 0;	/* unused here; kept since other files may reference it */
+/* Reset the scheduler lists and build the dependency graph for frag. */
+static void init_scheduler(struct fpvm_fragment *frag)
+{
+	int i;
+	struct insn *insn;
+
+	list_init(&unscheduled);
+	list_init(&waiting);
+	for (i = 0; i != PFPU_PROGSIZE; i++)
+		list_init(&ready[i]);
+
+	for (i = 0; i != frag->ninstructions; i++) {
+		insn = insns+i;
+		memset(insn, 0, sizeof(struct insn));
+		insn->vm_insn = frag->code+i;
+		insn->arity = fpvm_get_arity(frag->code[i].opcode);
+		insn->latency = pfpu_get_latency(frag->code[i].opcode);
+		list_init(&insn->dependants);
+		switch (insn->arity) {
+		case 3:	/* the condition is implicitly read from FPVM_REG_IFB */
+			add_data_ref(insn, &insn->cond, FPVM_REG_IFB);
+			/* fall through */
+		case 2:
+			add_data_ref(insn, &insn->opb, frag->code[i].opb);
+			/* fall through */
+		case 1:
+			add_data_ref(insn, &insn->opa, frag->code[i].opa);
+			/* fall through */
+		case 0:
+			add_data_ref(insn,
+			    &insn->dest, frag->code[i].dest)->setter = insn;
+			break;
+		default:
+			abort();
+		}
+		if (insn->unresolved)
+			list_add_tail(&unscheduled, &insn->more);
+		else
+			list_add_tail(&ready[0], &insn->more);
+
+	}
+
+	/*
+	 * We add a few dummy instructions at the end so that we don't need to
+	 * check array boundaries for the unlikely case of overrunning the
+	 * schedule.
+	 */
+	for (i = 0; i != PFPU_PROGSIZE; i++)
+		exits[i] = NULL;
+	for (; i != PFPU_PROGSIZE+MAX_LATENCY; i++)
+		exits[i] = &dummy_insn;
+}
+
+/* Emit insn at "cycle": release operands, allocate dest, unlock dependants. */
+static unsigned issue(struct insn *insn, int cycle)
+{
+	pfpu_instruction code;
+	struct data_ref *ref;
+	int end;
+
+	code.w = 0;	/* fields we don't set below must not be garbage */
+	end = cycle+insn->latency;
+	exits[end] = insn;
+
+Dprintf("cycle %d: insn %lu L %d (A %d B %d)\n",
+    cycle, insn-insns, insn->latency, insn->vm_insn->opa, insn->vm_insn->opb);
+	switch (insn->arity) {
+	case 3:	/* nothing to encode for cond; just drop the reference */
+		put_reg(insn->cond.dep);
+		/* fall through */
+	case 2:
+		code.i.opb = lookup_pfpu_reg(insn->vm_insn->opb);
+		put_reg(insn->opb.dep);
+		/* fall through */
+	case 1:
+		code.i.opa = lookup_pfpu_reg(insn->vm_insn->opa);
+		put_reg(insn->opa.dep);
+		break;
+	case 0:
+		break;
+	default:
+		abort();
+	}
+
+	code.i.dest = alloc_reg(insn);
+	code.i.opcode = fpvm_to_pfpu(insn->vm_insn->opcode);
+
+	foreach (ref, &insn->dependants) {
+		if (ref->insn->earliest <= end)
+			ref->insn->earliest = end+1;
+		/* ready[] ends at PFPU_PROGSIZE-1; later insns never fit */
+		if (!--ref->insn->unresolved &&
+		    ref->insn->earliest < PFPU_PROGSIZE) {
+Dprintf(" unlocked %lu -> %u\n", ref->insn-insns, ref->insn->earliest);
+			list_del(&ref->insn->more);
+			list_add_tail(ready+ref->insn->earliest,
+			    &ref->insn->more);
+		}
+	}
+
+	return code.w;
+}
+
+/* Number of elements on list (only used by the debug output). */
+static int count(const struct list *list)
+{
+	int n = 0;
+	const struct list *p;
+
+	for (p = list->next; p != list; p = p->next)
+		n++;
+	return n;
+}
+
+/* Issue insns cycle by cycle into code[]; returns length, or -1 on overflow. */
+static int schedule(struct fpvm_fragment *frag, unsigned int *code)
+{
+	int remaining;
+	int i, last, end;
+	struct insn *insn;
+
+	remaining = frag->ninstructions;
+	for (i = 0; remaining; i++) {
+		if (i == PFPU_PROGSIZE)
+			return -1;
+Dprintf("@%d --- remaining %d, waiting %d + ready %d = ", i, remaining,
+    count(&waiting), count(&ready[i]));
+		list_concat(&waiting, &ready[i]);
+Dprintf("%d\n", count(&waiting));
+		foreach (insn, &waiting)
+			if (!exits[i+insn->latency]) {
+				code[i] = issue(insn, i);
+				list_del(&insn->more);
+				remaining--;
+				break;
+			}
+		if (exits[i])	/* insn exits now: drop its own reference */
+			put_reg(exits[i]);
+	}
+
+	/*
+	 * Add NOPs to cover unfinished instructions.
+	 */
+	last = i;
+	end = i+MAX_LATENCY;
+	if (end > PFPU_PROGSIZE)
+		end = PFPU_PROGSIZE;
+	while (i != end) {
+		if (exits[i])
+			last = i+1; /* @@@ ? */
+		i++;
+	}
+	return last;
+}
+
+/*
+ * Schedule frag into code[PFPU_PROGSIZE] and put initial register values into
+ * reg[]. Returns the number of words used in code[], or -1 if frag won't fit.
+ */
+int gfpus_schedule(struct fpvm_fragment *frag, unsigned int *code,
+    unsigned int *reg)
+{
+	pfpu_instruction vecout;
+	int res;
+
+	init_registers(frag, reg);
+	memset(code, 0, PFPU_PROGSIZE*sizeof(*code));
+	init_scheduler(frag);
+	res = schedule(frag, code);
+	free(regs);
+	if (res < 0)
+		return res;
+	if (frag->vector_mode)		/* no VECTOUT is appended here */
+		return res;
+	if (res == PFPU_PROGSIZE)	/* no room left for VECTOUT */
+		return -1;
+
+	vecout.w = 0;
+	vecout.i.opcode = FPVM_OPCODE_VECTOUT;
+	code[res] = vecout.w;
+
+	return res+1;
+}
diff --git a/m1/perf/try b/m1/perf/try
new file mode 100755
index 0000000..dbf643c
--- /dev/null
+++ b/m1/perf/try
@@ -0,0 +1,6 @@
+#!/bin/sh -x
+
+M1=`make path`
+
+make CFLAGS_EXTRA=-DCOMP_DEBUG SCHED=sched.o
+gdb --args ./main $M1/flickernoise/patches/*/*Godhead*.fnp