gnugo-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[gnugo-devel] mkdfa.c


From: Dave Denholm
Subject: [gnugo-devel] mkdfa.c
Date: 04 Nov 2002 21:33:39 +0000

For what it's worth, here's the version of (what I called) mkdfa.c
to generate a dfa that doesn't merge paths with different histories.

Looking back at it, it increases the number of states by far more than I
had realised, and so is possibly not viable. Don't know if it is
inherent because of the number of different possible routes, or
if it is a bug in my implementation which fails to merge some
possible branches.

Anyway, the sort of thing it generates is

static const state_t state_owl_defendpat[] = {
/*     0 */ {    0, {     0,     0,     0,     0 }},/* 0 */
/*     1 */ {    0, {     0,     2, 67412,     0 }},/* 0 */
/*     2 */ {    0, {     3, 57012, 60268, 67084 }},/* 0 */
/*     3 */ {    0, {     4, 47835, 53815, 57004 }},/* 0 */
/*     4 */ {  422, {     5, 36406, 41989, 47725 }},/* 422 */
/*     5 */ {    0, {     6, 26712, 28691, 35946 }},/* 422 */
/*     6 */ {    0, {     7, 20167, 22127, 22394 }},/* 422 */
/*     7 */ {    0, {     8, 17923, 19513, 20161 }},/* 422 */
/*     8 */ {    0, {     9,  9641, 16804, 17851 }},/* 422 */
/*     9 */ {    0, {    10,  7031,  8973,  9585 }},/* 422 */
/*    10 */ {    0, {    11,  5771,  6650,  7022 }},/* 422 */
/*    11 */ {    0, {    12,  5619,  5640,  5642 }},/* 422 */
/*    12 */ {    0, {    13,  5579,  5579,  5603 }},/* 422 */
/*    13 */ {    0, {    14,  2992,  3600,  5315 }},/* 422 */
/*    14 */ {    0, {    15,   681,   968,   969 }},/* 422 */
/*    15 */ {    0, {    16,   557,   557,   557 }},/* 422 */
/*    16 */ {  391, {    17,   538,   538,   538 }},/* 1530 */
/*    17 */ {    0, {    18,   466,   524,   526 }},/* 1530 */
/*    18 */ {    0, {    19,   465,   465,   465 }},/* 1530 */
/*    19 */ {    0, {    20,   390,   390,   410 }},/* 1530 */
/*    20 */ {    0, {    21,   360,   377,   360 }},/* 1530 */
/*    21 */ {    0, {    22,   324,   324,   324 }},/* 1530 */
/*    22 */ {    0, {    23,    23,    23,    23 }},/* 1530 */
/*    23 */ {    0, {    24,    24,    24,    24 }},/* 1530 */
/*    24 */ {    0, {    25,    25,    25,    25 }},/* 1530 */
/*    25 */ {    0, {    26,    26,    26,    26 }},/* 1530 */
/*    26 */ {    0, {    27,    27,    27,   124 }},/* 1530 */
/*    27 */ {    0, {    28,    28,    28,    28 }},/* 1530 */
/*    28 */ {    0, {    29,    29,    29,    29 }},/* 1530 */
/*    29 */ {    0, {    30,    77,    30,    30 }},/* 1530 */
/*    30 */ {    0, {    31,    31,    31,    31 }},/* 1530 */
/*    31 */ {    0, {    32,    75,    75,    75 }},/* 1530 */
/*    32 */ {    0, {    33,    33,    33,    33 }},/* 1530 */
/*    33 */ {    0, {    34,    74,    74,    74 }},/* 1530 */
/*    34 */ {    0, {    35,    72,    72,    72 }},/* 1530 */
/*    35 */ {    0, {    36,    54,    54,    54 }},/* 1530 */
/*    36 */ {  388, {    37,    37,    37,    37 }},/* 11373 */
/*    37 */ {    0, {    38,    38,    38,    38 }},/* 11373 */
/*    38 */ {    0, {    39,    39,    39,    39 }},/* 11373 */
/*    39 */ {    0, {    40,    40,    40,    40 }},/* 11373 */
/*    40 */ {    0, {    41,    41,    41,    41 }},/* 11373 */
/*    41 */ {    0, {    42,    42,    42,    42 }},/* 11373 */
/*    42 */ {    0, {    43,    43,    43,    43 }},/* 11373 */
/*    43 */ {    0, {    44,    44,    44,    44 }},/* 11373 */
/*    44 */ {    0, {    45,    45,    45,    45 }},/* 11373 */
/*    45 */ {    0, {    46,    46,    46,    46 }},/* 11373 */
/*    46 */ {    0, {    47,    47,    47,    47 }},/* 11373 */
/*    47 */ {    0, {    48,    48,    48,    48 }},/* 11373 */
/*    48 */ {    0, {    49,    49,    49,    49 }},/* 11373 */
/*    49 */ {    0, {    50, -11373, -11373, -11373 }},/* 11373 */
/*    50 */ {    0, {    51,    51,    51,    51 }},/* 11373 */
/*    51 */ {    0, {    52,    52,    52,    52 }},/* 11373 */
/*    52 */ {    0, {    53, -11373, -11373, -11373 }},/* 11373 */
/*    53 */ {17927, { -17928, -17928, -17928, -17928 }},/* 17928 */
/*    54 */ {    0, {    55,    55,    55,    55 }},/* 1530 */
/*    55 */ {    0, {    56,    56,    56,    56 }},/* 1530 */
/*    56 */ {    0, {    57,    57,    57,    57 }},/* 1530 */
/*    57 */ {    0, {    58,    58,    58,    58 }},/* 1530 */
/*    58 */ {    0, {    59,    59,    59,    59 }},/* 1530 */
/*    59 */ {    0, {    60,    60,    60,    60 }},/* 1530 */
/*    60 */ {    0, {    61,    61,    61,    61 }},/* 1530 */
/*    61 */ {    0, {    62,    62,    62,    62 }},/* 1530 */
/*    62 */ {    0, {    63,    63,    63,    63 }},/* 1530 */
/*    63 */ {    0, {    64,    64,    64,    64 }},/* 1530 */
/*    64 */ {    0, {    65,    65,    65,    65 }},/* 1530 */
/*    65 */ {    0, {    66,    66,    66,    66 }},/* 1530 */
/*    66 */ {    0, {    67,    67,    67,    67 }},/* 1530 */
/*    67 */ {    0, {    68, -1530, -1530, -1530 }},/* 1530 */
/*    68 */ {    0, {    69,    69,    69,    69 }},/* 1530 */
/*    69 */ {    0, {    70,    70,    70,    70 }},/* 1530 */
/*    70 */ {    0, {    71, -1530, -1530, -1530 }},/* 1530 */
/*    71 */ {17927, { -17933, -17933, -17933, -17933 }},/* 17933 */
/*    72 */ {    0, {    73, -1530, -1530, -1530 }},/* 1530 */
/*    73 */ {  388, { -11373, -11373, -11373, -11373 }},/* 11373 */
/*    74 */ {    0, {    72,    72,    72,    72 }},/* 1530 */
/*    75 */ {    0, {    76,    76,    76,    76 }},/* 1530 */
/*    76 */ {    0, {    74,    74,    74,    74 }},/* 1530 */
/*    77 */ {  392, {    78,    78,    78,    78 }},/* 8595 */
/*    78 */ {    0, {    79,   122,   122,   122 }},/* 8595 */
/*    79 */ {    0, {    80,    80,    80,    80 }},/* 8595 */
/*    80 */ {    0, {    81,   121,   121,   121 }},/* 8595 */
/*    81 */ {    0, {    82,   119,   119,   119 }},/* 8595 */
/*    82 */ {    0, {    83,   101,   101,   101 }},/* 8595 */
/*    83 */ {  388, {    84,    84,    84,    84 }},/* 11377 */
/*    84 */ {    0, {    85,    85,    85,    85 }},/* 11377 */
/*    85 */ {    0, {    86,    86,    86,    86 }},/* 11377 */
/*    86 */ {    0, {    87,    87,    87,    87 }},/* 11377 */
/*    87 */ {    0, {    88,    88,    88,    88 }},/* 11377 */
/*    88 */ {    0, {    89,    89,    89,    89 }},/* 11377 */
/*    89 */ {    0, {    90,    90,    90,    90 }},/* 11377 */
/*    90 */ {    0, {    91,    91,    91,    91 }},/* 11377 */
/*    91 */ {    0, {    92,    92,    92,    92 }},/* 11377 */
/*    92 */ {    0, {    93,    93,    93,    93 }},/* 11377 */
/*    93 */ {    0, {    94,    94,    94,    94 }},/* 11377 */
/*    94 */ {    0, {    95,    95,    95,    95 }},/* 11377 */
/*    95 */ {    0, {    96,    96,    96,    96 }},/* 11377 */
/*    96 */ {    0, {    97, -11377, -11377, -11377 }},/* 11377 */
/*    97 */ {    0, {    98,    98,    98,    98 }},/* 11377 */
/*    98 */ {    0, {    99,    99,    99,    99 }},/* 11377 */
/*    99 */ {    0, {   100, -11377, -11377, -11377 }},/* 11377 */
/*   100 */ {17927, { -17937, -17937, -17937, -17937 }},/* 17937 */

where the comment on the right is the (head of the) list of
patterns matched up to and including that state. A -ve
number indicates both terminal state and gives the pattern
list.

It still stores the attribute matched at each site. This is
not needed, but I included it so that the engine can check
consistency.

It doesn't plug directly into mkpat. Theres some separation
I also did to simplify the interface. The interface is pretty
obvious, but I guess I ought to collect the rest into a patch.
I'll need to download the latest version before doing that.


Oh yes - it occurred to me that dfa ought to be marked 'const'
in order to reduce size of core files, if nothing else !

dd
-- 
address@hidden          http://www.insignia.com


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\
 * This is GNU GO, a Go program. Contact address@hidden, or see       *
 * http://www.gnu.org/software/gnugo/ for more information.          *
 *                                                                   *
 * Copyright 1999, 2000, 2001, 2002 by the Free Software Foundation. *
 *                                                                   *
 * This program is free software; you can redistribute it and/or     *
 * modify it under the terms of the GNU General Public License as    *
 * published by the Free Software Foundation - version 2             *
 *                                                                   *
 * This program is distributed in the hope that it will be useful,   *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of    *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the     *
 * GNU General Public License in file COPYING for more details.      *
 *                                                                   *
 * You should have received a copy of the GNU General Public         *
 * License along with this program; if not, write to the Free        *
 * Software Foundation, Inc., 59 Temple Place - Suite 330,           *
 * Boston, MA 02111, USA.                                            *
\* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 * * * * * * * * fast pattern matching with DFA  version 2.9 * * *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

#include "patterns.h"
#include "mkdfa.h"
#include "dfa.h"
#include <assert.h>

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#define gg_assert(x) assert(x)



/* #define DFA_TRACE define this to trace the program */
/* #define DFA_PARANOIAC define this to activate a lot of assert() */
#define DFA_MAX_BOARD 21
#define MAX_ORDER DFA_MAX_BOARD*DFA_MAX_BOARD*4
#define DFA_RESIZE_STEP 20000
#define DFA_INIT_SIZE 250
#ifndef EMPTY
#define EMPTY     0             /* . */
#define WHITE     1             /* O */
#define BLACK     2             /* X */
#endif
#define OUT_BOARD 3             /* # */


/* dfa.h defines the format of dfa data within gnugo.
 * Inside mkdfa, we use different structures,
 * before ouputting them in the public form. In particular,
 * we store a matched-so-far pattern list at each state.
 */

/* An attribute is an integer property stored at each state.
 * We use it as an index into an array of attrib structures.
 * Each contains a pattern number and a link (index) to
 * another entry. Entry 0 is reserved to mean end of list.
 */

typedef struct mk_attrib
{
  int val;
  int next;
}
mk_attrib_t;


/* dfa state.
 * The dfa is an array of these structures. Each has
 * an attribute number and links to the 4 possible
 * subsequent states. Entry 0 is reserved to indicate
 * end. Entry 1 is always the initial state.
 */

typedef struct mk_state
{
  int att;     /* patterns matched at this state */
  int list;    /* all patterns up to this state */
  int depth;   /* for consistency tests */
  int newidx;  /* Used during final pruning */
  int next[4]; /* index of each possible subsequent state */
}
mk_state_t;


/* dfa. Inside mkdfa we don't really need to hold this in
 * a structure since we only ever have one.
 */

typedef struct mk_dfa
{
  /* file header */
  char name[80];
  
  /* transition graph */
  mk_state_t *states;
  int numStates;
  int lastState;
}
mk_dfa_t;


/* We collect the pattern strings initially into
 * an array of these, then sort, then add them
 * to the dfa shortest to longest.
 */

typedef struct
{
  int patno;
  char *str;
} mk_string_t;



/* set by -V
 * bit 0 = general tracing
 * bit 1 = debug conversion to string
 * bit 2 = debug sync product
 * bit 3 = dump intermediate dfa
 */

int dfa_verbose;

/*********************
 *  Private data     *
 *********************/

static mk_dfa_t dfa;               /* The dfa being constructed */

static mk_attrib_t *attribs; /* The attrib array */
static int last_attrib;  /* last used entry in attribs */
static int max_attribs;  /* size of *attribs array */

static mk_string_t *strings;  /* The pattern strings */
static int num_strings;       /* Num. strings in use */
static int max_strings;       /* size of strings array */


/* To be sure that everything was well initialized */
static int dfa_was_initialized = 0;

static int *cache; /* cache used during do_sync_product */


/* convert is a table to convert the colors */
static const int convert[3][4] = {
  {-1, -1, -1, -1},             /* not used */
  {EMPTY, WHITE, BLACK, OUT_BOARD},     /* WHITE */
  {EMPTY, BLACK, WHITE, OUT_BOARD}      /* BLACK */

};


/* convert ATT_* values to the corresponding expected values on the board */
static const char att2val[8] =
  { '.', 'X', 'O', 'x', 'o', ',', 'a', '!' };

#define EXPECTED_VAL(att_val) att2val[att_val]


static int spiral[8][MAX_ORDER];



/************************************************
 *   forward declaration of private functions   *
 ************************************************/
static void clean_dfa(mk_dfa_t *pdfa);
static void resize_dfa(mk_dfa_t *pdfa, int numStates);
static void create_dfa(mk_dfa_t *pdfa, const char *str, int att_val);
static int dfa_size(mk_dfa_t *pdfa);
static void dump_dfa(FILE *f, mk_dfa_t *pdfa, int state);
static void new_dfa(mk_dfa_t *pdfa, const char *name);
static void kill_dfa(mk_dfa_t *pdfa);
static void print_c_dfa(FILE *of, const char *name, mk_dfa_t *pdfa);


/********************************
 * manipulating attributes list *
 ********************************/

/*
 * Test if val is member of the attributes set att
 */

static int
member_att(int att, int val)
{
  int res;

  res = 0;
  while (!res && att != 0) {
    if (attribs[att].val == val)
      return 1;
    att = attribs[att].next;
  }
  return 0;
}


/*
 * Append one new entry to the attrib list, extending it
 * if necessary
 */

static int
append_att(int val, int next)
{
  if (next == 0)
    return val+1;  /* one attrib per pattern made up front */

  if (++last_attrib >= max_attribs)
    attribs = realloc(attribs,
                      (max_attribs += 64) * sizeof(mk_attrib_t));

  attribs[last_attrib].val = val;
  attribs[last_attrib].next = next;
  return last_attrib;
}


/* Form a new attribute list by prepending list
 * att1 to att2. The new list flows into att2
 * directly.
 */

static int prepend_att_list(int att1, int att2)
{
  /* New list will start at the next free slot */
  int start = last_attrib+1;
  int a;

  gg_assert(att1 != 0);
  gg_assert(att2 != 0);

  a = start;
  while (att1 != 0) {
    int n = append_att( attribs[att1].val, a+1);
    gg_assert(n==a);
    ++a;
    att1 = attribs[att1].next;
  }

  /* Now fix up the last entry added to point at att2 */
  attribs[a-1].next = att2;
  return start;
}


/* 
 * return the union of two attribute sets att1 & att2,
 * reusing entries where possible (since atts are never
 * changed once added).
 */

static int
union_att(int att1, int att2)
{
  int a;

  if (att1 == 0)
    return att2;

  if (att2 == 0)
    return att1;

  /*
   * Search the attribute array to see if our list already
   * exists. ie see if we can find a sublist starting with
   * the contents of att1 and ending pointing at att2
   * Search backwards, since we are more likely to find the
   * match near the end.
   */

  for (a=last_attrib; a > 0; --a) {
    int a1, b;
    for (a1=att1, b=a;
         a1 != 0 && attribs[a1].val == attribs[b].val;
         a1=attribs[a1].next, b=attribs[b].next)
      ;
      
    /* Now a1 and b are the end of the lists starting
     * att1 and a respectively.
     * If a1 == 0 and b == att2, then a is the target
     * list.
     */
    if (a1 == 0 && b == att2)
      return a;
  }

   
  /* Otherwise, we have to actually make the list */

  return prepend_att_list(att1, att2);
}

/**********************
 * manipulating dfa's *
 **********************/


/*
 * return the effective size of a dfa in kB.
 * Note that this is the size in the output format, not the
 * internal format. It is only approximate, since the internal
 * datastructures may contain unreachable entries which will be
 * pruned before we output them.
 */

static int
dfa_size(mk_dfa_t *pdfa)
{
  int states_size, indexes_size;

  states_size = (pdfa->lastState + 1) * sizeof(state_t);
  indexes_size = (last_attrib + 1) * sizeof(attrib_t);

  return (states_size + indexes_size + sizeof(dfa_t)) / 1024;
}


/* 
 * resize memory for a dfa. All new memory is zeroed
 */

static void
resize_dfa(mk_dfa_t *pdfa, int numStates)
{
  mk_state_t *pBuf;

  if (dfa_verbose > 0)
    fprintf(stderr, "Resizing dfa %s\n", pdfa->name);

  gg_assert(pdfa->lastState < pdfa->numStates);

  pBuf = realloc(pdfa->states, numStates * sizeof(mk_state_t));
  if (pBuf == NULL) {
    fprintf(stderr, "No memory left for dfa: %s", pdfa->name);
    exit(1);
  }

  if (numStates > pdfa->numStates)
    memset(pBuf + pdfa->numStates, 0,
           (numStates - pdfa->numStates) * sizeof(mk_state_t));

  pdfa->states = pBuf;
  pdfa->numStates = numStates;
}

/*
 * Ensure dfa has space for at least newStates more states.
 * Memory for those new states is guaranteed to be zero.
 */

static void
ensure_dfa(mk_dfa_t *pdfa, int newStates)
{
  gg_assert(newStates <= DFA_RESIZE_STEP);

  if (pdfa->lastState + newStates >= pdfa->numStates)
    resize_dfa(pdfa, pdfa->numStates + DFA_RESIZE_STEP);
}


/* 
 * dump a dfa (debugging purpose).
 */

static const char *line =
  "----------------------------------------------------\n";

static void
dump_dfa(FILE *f, mk_dfa_t *pdfa, int state)
{
  int i;
  int att, k;


  if (state == 0) {
    fprintf(f, line);
    fprintf(f, " state  | depth |  .    |   O    |   X    |   #    |  list | 
att \n");
    fprintf(f, line);
  }

  for (i = state+1; i <= pdfa->lastState; i++) {
    mk_state_t *pstate = pdfa->states + i;
    fprintf(f, " %6d |", i);
    fprintf(f, " %6d |", pstate->depth);
    fprintf(f, " %6d | %6d | %6d | %6d |",
            pstate->next[0], pstate->next[1],
            pstate->next[2], pstate->next[3]);
    fprintf(f, " %5d |", pdfa->states[i].list);

    att = pdfa->states[i].att;
    k = 0;
    fprintf(f, " %5d:", att);
    while (att != 0 && k < 10) {
      fprintf(f, " %4d", attribs[att].val);
      att = attribs[att].next;
      k++;
    }
    if (att != 0)
      fprintf(f, " ...");

    fprintf(f, "\n");
  }
  fprintf(f, line);
  fflush(f);
}


/*
 * Reset a dfa
 */

static void
clean_dfa(mk_dfa_t *pdfa)
{
  memset(pdfa->states, 0, pdfa->numStates * sizeof(mk_state_t));
  pdfa->lastState = 1;          /* state 0 is reserveed */
}


/* 
 * allocate memory for a new dfa 
 */

static void
new_dfa(mk_dfa_t *pdfa, const char *name)
{
  memset(pdfa, 0, sizeof(mk_dfa_t));
  pdfa->lastState = -1;  /* avoid assertion in resize */
  resize_dfa(pdfa, DFA_INIT_SIZE);
  clean_dfa(pdfa);
  if (name != NULL)
    strcpy(pdfa->name, name);
  else
    strcpy(pdfa->name, "noname ");

  if (dfa_verbose > 0)
    fprintf(stderr, "dfa %s is born :)\n", pdfa->name);

}

/*
 * free memory used by a dfa
 */

static void
kill_dfa(mk_dfa_t *pdfa)
{
  free(pdfa->states);
  if (dfa_verbose > 0)
    fprintf(stderr, "dfa %s is dead :(\n", pdfa->name);

  memset(pdfa, 0, sizeof(mk_dfa_t));
}





/*
 * print c dfa:
 * print the dfa in c (gnugo) format.
 */

static void
print_c_dfa(FILE *of, const char *name, mk_dfa_t *pdfa)
{
  int i;

  fprintf(of, "\n#include \"dfa.h\"\n");

  fprintf(of, "static const state_t state_%s[%d] = {\n", name, pdfa->lastState 
+ 1);
  for (i = 0; i <= pdfa->lastState; i++) {
    fprintf(of, "/* %5d */ {%5d, { %5d, %5d, %5d, %5d }},/* %d */\n",
            i, pdfa->states[i].att,
            pdfa->states[i].next[0] ? pdfa->states[i].next[0] : 
-(pdfa->states[i].list),
            pdfa->states[i].next[1] ? pdfa->states[i].next[1] : 
-(pdfa->states[i].list),
            pdfa->states[i].next[2] ? pdfa->states[i].next[2] : 
-(pdfa->states[i].list),
            pdfa->states[i].next[3] ? pdfa->states[i].next[3] : 
-(pdfa->states[i].list),
            pdfa->states[i].list);
  }
  fprintf(of, "};\n\n");


  fprintf(of, "static const attrib_t idx_%s[%d] = {\n", name, last_attrib + 1);
  for (i = 0; i <= last_attrib; i++)
    fprintf(of, "/* %3d */ {%d,%d},\n", i, attribs[i].val, attribs[i].next);
  fprintf(of, "};\n\n");

  fprintf(of, "static struct dfa dfa_%s = {\n", name);
  fprintf(of, " \"%s\",\n", name);
  fprintf(of, "state_%s,\n", name);
  fprintf(of, "idx_%s,\n", name);
  fprintf(of, "};\n");

}






/* 
 * Synchronization product between two automata.
 *
 * L(A) is the set of patterns recognized by the automaton A.
 *
 * A syncronized product betwenn two acyclic deterministic automata
 * A1 and A2 is an acyclic deterministic classifier A1xA2 that 
 * recognize and classify the languages 
 * L(A1), L(A2), L(A1 Union A2) and L(A1 Inter A2).
 *
 * This algorithm do the product and the reduction at the same time.
 *
 * See Hopcroft & Ullman "The design and analysis of computer algorithms"
 * Ed. Addison-Wesley, Reading MA, 1974
 * For the theorical aspects.
 *
 *
 * In the general case, we find the sync product of two dfa
 * by traversing the two in parallel, generating a new state
 * for each pair of states.
 *
 * However, this is not an implementation of a generic sync product.
 * It is special cased for adding one pattern to a dfa, and takes
 * advantage of a number of constraints:
 *  - a pattern string is a linear dfa with exactly one attribute
 *    at the terminal node
 *  - strings are added to the dfa in order of length. Thus the
 *    dfa cannot contain states "deeper" than the string. (But strings
 *    can of course be the same depth, which means we may find a terminal
 *    state with an attribute at the same depth as the end of the string.)
 *  - adding a pattern to a dfa is usually a fairly small perturbation.
 *
 *
 * The two dfas are in different formats :
 *
 * The 'left' dfa is the usual array of states.
 *
 * The 'right' dfa is a string.
 * The possible input symbols in the sting are :
 * 
 * '.', ',', '*', '!' for EMPTY expected.
 * 'X'                for BLACK expected.
 * 'O'                for WHITE expected.
 * 'x'                for BLACK|EMPTY expected.
 * 'o'                for WHITE|EMPTY expected.
 * '#', '+', '-', '|' for OUT_BOARD expected.
 * '?'                for EMPTY|BLACK|WHITE expected.
 * '$'                for EMPTY|BLACK|WHITE|OUT_BOARD expected.
 *
 * So for example a "state" of 'x' means next[1] and next[3] are
 * 0,and next[0] and next[2] are the next state.
 * In order to follow the convention that state 0 is the
 * error state, all strings have an illegal character '@'
 * at [0].
 *
 * Before we start, the dfa contains states 1 -> lastState
 * Then during the product, we add new states after lastState.
 * Whenever we fall out of the string, we add a link back
 * to a low-numbered state, and of course all subsequent
 * states are already correct.
 *
 * A cache is used to store states already encountered :
 * because patterns contain wildcards, there are several
 * paths to any state. Because the right dfa is a linear dfa,
 * any state in the left dfa can only be coupled to one state
 * in the right dfa at the same depth. So the left state number
 * is sufficient.
 */


/*
 * This recursive routine fills in entry lastState in the output dfa
 *
 *  state   - state being written
 *  list    - list of patterns matched up to (but excluding) 'left' state
 *  left    - the source state number for this step
 *  string  - the string representing the state table of the right dfa
 *  right   - the state number in the string
 */

static void
do_sync_product(int state, int list, int left,
                const mk_string_t *string, int right)
{
  int c;

  gg_assert(left != 0 || right != 0);

  if (dfa_verbose & 4)
    fprintf(stderr, "[%d] do_sync_product list-%d left=%d right=%d '%s'\n",
            state, list, left, right, string->str + right);

  /* If both are still active, the states must be the same depth */
  gg_assert(left == 0 || right == 0 || dfa.states[left].depth == right);

  if (string->str[right] == 0) {

    /* End of string => we have matched the pattern.
     * We add strings in order of increasing length.
     * So either left == 0 or left is itself a terminal
     * matching one or more patterns.
     * Attributes of output state is pattern + attributes
     * of left state.
     */

    dfa.states[state].att = union_att(dfa.states[left].att,
                                      string->patno+1);

    dfa.states[state].list = union_att(list, dfa.states[state].att);

    dfa.states[state].depth = right;

    gg_assert(dfa.states[left].next[0] == 0);
    gg_assert(dfa.states[left].next[1] == 0);
    gg_assert(dfa.states[left].next[2] == 0);
    gg_assert(dfa.states[left].next[3] == 0);

    gg_assert(left == 0 || dfa.states[left].att != 0);

    return;
  }


  /* Otherwise, right dfa is still active, or has already
   * switched to its error state. In either case, attrib
   * of output state is attrib of left state (which may be
   * the error state). list is union of incoming list and
   * attribute at this state.
   */

  dfa.states[state].att  = dfa.states[left].att;
  dfa.states[state].list = list = union_att(list, dfa.states[left].att);

  dfa.states[state].depth = right ? right : dfa.states[left].depth;

  /* Now scan each possible out-transition */

  for (c = 0; c < 4; c++) {

    /* A table for figuring out whether right dfa
     * remains active after transition c.
     */

    static const char * const lookup[4] = {
      "$?.ox,a!*",
      "$?Oo",
      "$?Xx",
      "$#+-|"
    };

    int nleft = dfa.states[left].next[c];
    int nright = (strchr(lookup[c], string->str[right])) ? right+1 : 0;

    
    if (dfa_verbose & 4)
      fprintf(stderr, "[%d][%d] : %d %d %d\n", state, c, nleft, list, nright);

    if (nright == 0) {

      /* Failed to match the string, so we just fall back into
       * the original part of the table (L,0) = L. (inc. 0 for
       * nleft == 0)
       */
      dfa.states[state].next[c] = nleft;

    } else if (nleft != 0) {

      /* Both dfa and string are active.
       * nleft describes this state uniquely,
       * since nright can only have one value for any nleft.
       */

      int *pcache = cache + nleft * 2;

      gg_assert(dfa.states[nleft].depth == nright);

      if (*pcache == 0) {

        ensure_dfa(&dfa, 1);

        *pcache = ++(dfa.lastState);
        
        /* Recurse to build it. */

        do_sync_product(dfa.lastState,
                        list,
                        nleft, string, nright);
      }

      dfa.states[state].next[c] = *pcache;

    } else {

      /* nleft is 0 and nright is non-zero. list is the set of patterns which
       * matched in the (left) dfa before it entered the error state.
       * What we do next has to take list into account.
       * What we choose to do is generate a complete dfa state sequence for the
       * string, and then cache the *start* of the sequence. This will certainly
       * generate unreachable states, but it does mean that we don't have to
       * cache every pair of (list, nright). We can still share a sequence
       * even if we encounter nright == 5 before nright == 4.
       */

      int *pcache = cache + list * 2 + 1;
      int nstate;

      if (*pcache == 0) {

        /* Need to generate the sequence */

        int len = strlen(string->str);
        mk_state_t *p;
        int i;

        ensure_dfa(&dfa, len);
        *pcache = ++(dfa.lastState);

        p = dfa.states + dfa.lastState;

        for (i=2; i < len; ++i) {
          int n;

          p->att  = 0;
          p->list = list;
          p->depth = i;

          for (n=0; n<4; ++n) {
            if (strchr(lookup[n], string->str[i]))
              p->next[n] = dfa.lastState + 1;
          }

          ++dfa.lastState;
          ++p;
        }

        p->att  = string->patno+1;
        p->list = union_att(list, string->patno + 1);
        p->depth = len;

        if (dfa_verbose & 4)
          fprintf(stderr, "[%d..%d] = string with list=%d\n",
                  *pcache, dfa.lastState, list);
      }

      nstate = *pcache + nright - 2;

      gg_assert(dfa.states[nstate].depth == nright);

      /* The list at the next state will be the same as here,
       * unless we have reached the end of the string.
       */
      gg_assert(string->str[nright] == 0 ||
                dfa.states[nstate].list == list);

      dfa.states[state].next[c] = nstate;
    }

    if (dfa_verbose & 4)
      fprintf(stderr, "[%d][%d] = %d\n",
              state, c, dfa.states[state].next[c]);
  }
}



/* 
 * Init/end functions
 */

void
dfa_init(void)
{
  if (dfa_verbose > 1)
    fprintf(stderr, "dfa: init\n");
  dfa_was_initialized++;
  buildSpiralOrder(spiral);
  new_dfa(&dfa, "master dfa");
  attribs = malloc(64 * sizeof(mk_attrib_t));
  attribs[0].val = -1;
  attribs[0].next = 0;
  last_attrib = 0;
  max_attribs = 64;
}

void
dfa_end(void)
{
  if (dfa_verbose > 1)
    fprintf(stderr, "dfa: end\n");

  kill_dfa(&dfa);
  free(attribs);
  dfa_was_initialized--;
}

/*
 * qsort callback to sort the strings by length.
 * Note - never return 0, since that lets the qsort
 * implementation choose the order.
 */

static int
sort_callback(const void *x, const void *y)
{
  const mk_string_t *a = x;
  const mk_string_t *b = y;

  int lena = strlen(a->str);
  int lenb = strlen(b->str);

  if (lena != lenb)
    return lena - lenb;

  /* If they are the same length, keep them in
   * pattern order
   */

  return a->patno - b->patno;
}


static void
reindex(int state)
{
  int c;

  if (state == 0)
    return;

  if (dfa.states[state].newidx)
    return;

  dfa.states[state].newidx = dfa.lastState++;
  
  for (c=0; c < 4; ++c)
    reindex(dfa.states[state].next[c]);
}


/*
 * Do any final processing.
 *
 * In this implementation, final processing means all processing :
 * we have just been collecting the strings so far.
 */

static void 
dfa_finalize(mk_dfa_t *pdfa)
{
  int j;
  int cache_size = 1024;
  int dumped = 0;

  /* First, sort the strings by length */
  qsort(strings, num_strings, sizeof(mk_string_t), sort_callback);

  clean_dfa(pdfa);

  /* We are almost certainly going to need one attrib for each
   * pattern, so do them all now. Then we know that pattern n
   * is in attrib n+1 without having to search. Do attrib[0] = -1
   * while we are here.
   */

  max_attribs = num_strings+1;
  attribs = malloc( max_attribs * sizeof(mk_attrib_t));
  last_attrib = num_strings;

  for (j=0; j <= num_strings; ++j) {
    attribs[j].val = j-1;
    attribs[j].next = 0;
  }

  cache = calloc(cache_size * 2, sizeof(int));

  dfa.states[1].depth = 1;

  for (j=0; j < num_strings; ++j) {

    /* Make sure the cache is big enough. Need larger
     * of lastState and last_attrib
     */

    int sz = (dfa.lastState > last_attrib) ? (dfa.lastState + 5) : (last_attrib 
+ 5);

    if (sz > cache_size) {
      cache_size = sz;
      cache = realloc(cache, cache_size * 2 * sizeof(int));
    }

    memset(cache, 0, sz * 2 * sizeof(int));

    if (dfa_verbose > 0)
      fprintf(stderr, "Adding pattern %d to dfa : %s\n",
              strings[j].patno, strings[j].str);

    do_sync_product(1, 0, 1, strings + j, 1);

    if (dfa_verbose & 8) {
      /* Since we only append states, no point dumping it from the
       * start each time. (State 1 does change each time, but we can
       * live without it.)
       */
      dump_dfa(stderr, pdfa, dumped);
      dumped = pdfa->lastState;
    }
  }

  free(strings);
  free(cache);


#if 1
  /* Now prune the dead states by traversing the tree
   * to find all the reachable ones. This will also
   * sort the states into a hopefully cache-friendly
   * order, as a side effect.
   *
   * This makes debugging difficult, since it means the
   * dfa used in the engine is not exactly the same one
   * that dump_dfa shows while it's being built. Fortunately,
   * this compression can be turned off without changing
   * the logical dfa.
   */

  {
    int old = dfa.lastState;
    mk_state_t *new;

    dfa.lastState = 1;
    reindex(1);

    /* Make a new state array, and copy all the
     * reachable states into their new slots
     */

    new = calloc( (dfa.lastState+1), sizeof(mk_state_t));

    for (j=1; j <= old; ++j) {
      if (dfa.states[j].newidx) {
        int c;
        mk_state_t *p = new + dfa.states[j].newidx;
        p->att = dfa.states[j].att;
        p->list = dfa.states[j].list;
        for (c=0; c<4; ++c)
          p->next[c] = dfa.states[dfa.states[j].next[c]].newidx;
      }
    }

    /* Free the old states array and attach the new one. */

    free(dfa.states);
    dfa.states = new;
    dfa.numStates = dfa.lastState+1;
  }
#endif

}





/*
 * Build a pattern string from a pattern.
 * str must refer a buffer of size greater than MAX_ORDER.
 */
static void
pattern_2_string(struct pattern *pat, char *str, int trans, int ci, int cj)
{
  char work_space[DFA_MAX_BOARD * 4][DFA_MAX_BOARD * 4];
  int m, n;                     /* anchor position */
  int edges, borders, to_test;
  int i, j, k;
  char c;

  m = DFA_MAX_BOARD * 2 + ci;
  n = DFA_MAX_BOARD * 2 + cj;   /* position of the anchor */

  gg_assert(dfa_was_initialized);
  memset(str, 0, MAX_ORDER);
  memset(work_space, '#', sizeof(work_space));

  /* basic edge constraints */
  for (i = DFA_MAX_BOARD; i != DFA_MAX_BOARD * 3; i++)
    for (j = DFA_MAX_BOARD; j != DFA_MAX_BOARD * 3; j++)
      work_space[i][j] = '$';

  /* pattern mask */
  for (i = pat->mini + m; i != pat->maxi + m + 1; i++)
    for (j = pat->minj + n; j != pat->maxj + n + 1; j++)
      work_space[i][j] = '?';

  /* more advanced edge constraints */

  /* South constraint */
  if (pat->edge_constraints & SOUTH_EDGE) {
    for (i = m + pat->maxi + 1; i != DFA_MAX_BOARD * 3; i++)
      for (j = 0; j != DFA_MAX_BOARD * 3; j++)
        work_space[i][j] = '-';
  }

  /* East constraint */
  if (pat->edge_constraints & EAST_EDGE) {
    for (i = 0; i != DFA_MAX_BOARD * 3; i++)
      for (j = n + pat->maxj + 1; j != DFA_MAX_BOARD * 3; j++)
        work_space[i][j] = '|';
  }
  
  /* North constraint */
  if (pat->edge_constraints & NORTH_EDGE) {
    for (i = 0; i != m + pat->mini; i++)
      for (j = 0; j != DFA_MAX_BOARD * 4; j++)
        work_space[i][j] = '-';
  }

  /* West constraint */
  if (pat->edge_constraints & WEST_EDGE) {
    /* take care not to erase the south edge constraint */
    for (i = 0; i != m + pat->maxi + 1; i++)
      for (j = 0; j != n + pat->minj; j++)
        work_space[i][j] = '|';

    /* complete the last corner only if necessary */
    if (!(pat->edge_constraints & SOUTH_EDGE)) {
      for (i = m + pat->maxi + 1; i != DFA_MAX_BOARD * 3; i++)
        for (j = 0; j != n + pat->minj; j++)
          work_space[i][j] = '|';
    }
  }

  /* dump */
  if (dfa_verbose & 2) {
    for (i = DFA_MAX_BOARD - 1; i != DFA_MAX_BOARD * 3 + 1; i++) {
      for (j = DFA_MAX_BOARD - 1; j != DFA_MAX_BOARD * 3 + 1; j++) {
        if (i == m && j == n)
          fprintf(stderr, "s"); /* mark the anchor */
        else
          fprintf(stderr, "%c", work_space[i][j]);
      }
      fprintf(stderr, "\n");
    }
    fprintf(stderr, "\n");
  }

  /* pattern representation on the work space */
  for (k = 0; k != pat->patlen; k++) {
    c = EXPECTED_VAL(pat->patn[k].att);
    gg_assert(work_space[m + pat->patn[k].x - ci]
              [n + pat->patn[k].y - cj] == '?');
    work_space[m + pat->patn[k].x - ci][n + pat->patn[k].y - cj] = c;
  }

  /* dump */
  if (dfa_verbose & 2) {
    for (i = DFA_MAX_BOARD - 1; i != DFA_MAX_BOARD * 3 + 1; i++) {
      for (j = DFA_MAX_BOARD - 1; j != DFA_MAX_BOARD * 3 + 1; j++) {
        if (i == m && j == n)
          fprintf(stderr, "s"); /* mark the anchor */
        else
          fprintf(stderr, "%c", work_space[i][j]);
      }
      fprintf(stderr, "\n");
    }
    fprintf(stderr, "\n");
  }

  /* Now we can build the smaller pattern string as possible 
   * from the anchor */

  to_test = pat->patlen;        /* How many positions left to test ? */
  edges = pat->edge_constraints;        /* how many constraint tested ? */
  borders = 0xF; 
  /* we must test at least one intersection by border for 
   * patterns like
   * 
   * ???
   * O.O
   * ???
   * 
   * To ensure edge position.
   */

  for (k = 0;
       (k != MAX_ORDER - 1) && ((borders > 0) || edges || to_test > 0);
       k++) {
    j = spiral[trans][k] % (4 * DFA_MAX_BOARD);
    if (j >= 2 * DFA_MAX_BOARD)
      j -= 4 * DFA_MAX_BOARD;
    if (j <  - 2 * DFA_MAX_BOARD)
      j += 4 * DFA_MAX_BOARD;
    i = (spiral[trans][k] - j) / (4 * DFA_MAX_BOARD);

    if (i == pat->maxi)
      borders &= ~SOUTH_EDGE;
    if (i == pat->mini)
      borders &= ~NORTH_EDGE;
    if (j == pat->maxj)
      borders &= ~EAST_EDGE;
    if (j == pat->minj)
      borders &= ~WEST_EDGE;
    
    gg_assert(m + i < DFA_MAX_BOARD * 3 && m + i < DFA_MAX_BOARD * 3);
    str[k] = work_space[m + i][n + j];
    gg_assert(strchr("XOxo.,a!?$#|-+", str[k]));
    
    if (strchr("XOxo.,a!", str[k]))
      to_test--;
    if (strchr("#|-+", str[k])) {
      if (i > pat->maxi)
        edges &= ~SOUTH_EDGE;
      if (i < pat->mini)
        edges &= ~NORTH_EDGE;
      if (j > pat->maxj)
        edges &= ~EAST_EDGE;
      if (j < pat->minj)
        edges &= ~WEST_EDGE;
    }
  }
  
  gg_assert(k < MAX_ORDER);
  str[k] = '\0';                /* end of string */

  if (dfa_verbose > 0)
    fprintf(stderr, "converted pattern into string: %s\n", str);

}

/*
 * Add a pattern (in mkpat format) to the dfa.
 * Actually, in this implementation, we convert the pattern
 * to a string, then sit on the string until all have been
 * collected. The real work is done in dfa_finalize.
 */

void
dfa_add_pattern(struct pattern *pat, int index, int trans, int ci, int cj)
{
  char str[MAX_ORDER];

  if (dfa_verbose > 0)
    fprintf(stderr, "converting pattern %d into string.\n", index);

  pattern_2_string(pat, str, trans, ci, cj);

  if (num_strings >= max_strings)
    strings = realloc(strings, (max_strings += 64) * sizeof(mk_string_t));

  /* Prepend a '@' character to the string. This lets us use
   * consistent values of 0 = error and 1 = starting state.
   */

  strings[num_strings].patno = index;
  strings[num_strings].str = malloc( strlen(str)+2);
  strings[num_strings].str[0] = '@';
  strcpy(strings[num_strings].str+1, str);

  ++num_strings;
}


void dfa_write(FILE *output_FILE, const char *name, int patno)
{
    dfa_finalize(&dfa);

    fprintf(stderr, "---------------------------\n");

    fprintf(stderr, "dfa for %s\n", name);
    fprintf(stderr, "size: %d kB (%d states) for %d patterns\n",
            dfa_size(&dfa), dfa.lastState+1, patno);

    strcpy(dfa.name, name);
    print_c_dfa(output_FILE, name, &dfa);
    fprintf(stderr, "---------------------------\n");

    if (DFA_MAX_MATCHED/8 < patno)
      fprintf(stderr, "Warning: Increase DFA_MAX_MATCHED in 'dfa.h'.\n");

    kill_dfa(&dfa);
}

/*
 * Local Variables:
 * tab-width: 8
 * c-basic-offset: 2
 * End:
 */




reply via email to

[Prev in Thread] Current Thread [Next in Thread]