chicken-users
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Chicken-users] utf8 and string-ref performance


From: F. Wittenberger
Subject: Re: [Chicken-users] utf8 and string-ref performance
Date: Wed, 24 Nov 2010 17:05:38 +0100

Am Mittwoch, den 24.11.2010, 08:37 -0700 schrieb Alan Post:
> Can anyone point me in the right direction?

I'll paste an example from my code, because an example is sometime
better than a lyrics.

If you where to keep the byte offset (called "start" in the code down
there) with your position objects, you get O(1) access to the next utf8
character.


(define utf8-seek0
  (foreign-lambda*
   integer
   ((scheme-object str)                 ; the utf8 encode object
    (integer sl)                        ; it's length in byte
    (integer start)                     ; byte offset into str
    (integer index)                     ; utf8 char offset into str
    (integer pos)                       ; seek-to position (> index)
    )                                   ; returns next "start" position
   #<<EOF
  unsigned char *s=(unsigned char *)C_c_string(str);
  unsigned char *scan=s+start;
  unsigned char *limit=s+sl;
  if( index < pos) {
    while (index < pos) {
     if( scan >= limit ) {
      return(-1);
      /* raise_error( make3( TLREF(0), NIL_OBJ, make_string("index out
of bounds"), int2fx(pos) ) ); */
     }
     ++index;
     if (*scan < 0x80) scan++;
     else if (*scan < 0xE0) scan+=2;
     else if (*scan < 0xF0) scan+=3;
     else if (*scan < 0xF8) scan+=4;
     else if (*scan < 0xFC) scan+=5;
     else if (*scan < 0xFE) scan+=6;
     else return(-2);
    }
    return(scan-s);
  } else if(index > pos) {
     int size=0;
     while( index > pos ) {
       if( s > limit ) {
        return(-1);
       }
     do {
       size++;
       limit--;
       if( s > limit || size > 6 ) {
        return(-2);
       }
     } while((*limit >= 0x80) && (*limit < 0xC0));
     index--;
    }
    return(limit-s);
  } else {
    return(start);
  }
EOF
))

(define (utf8-seek s start index pos)
  (let ((v (utf8-seek0 s (string-length s) start index pos)))
    (if (fx>= v 0) v
        (raise
         (case v
           ((-1) "index out of bounds")
           ((-2) "bad string")
           (else 'utf8-seek))))))

Here we still pay the O(n) penalty, because we really require random
access and there is to the best of my knowledge no way around, except
that you where to parse the utf8 sequence into a vector of strings of
one utf8 char in size.  (Which looks kinda prohibitive expensive).

(define (utf8-substring str from to)
  (let ((start-offset (utf8-seek str 0 0 from)))
    (substring str start-offset (utf8-seek str start-offset from to))))

(define (utf8-string-ref str index)
  (utf8-string-getc str (utf8-seek str 0 0 index)))

;; Return the character at byte offset 'start' from the source string
;; (e.g., of a string port) and it's length.

(define utf8-string-getc*
  (foreign-lambda*
   integer
   ((scheme-object str) (integer sl) (integer start) ((c-pointer
integer) rsize))
   #<<EOF
  unsigned char *s=C_c_string(str);
  unsigned char *scan=s+start;
  unsigned char *limit=s+sl;
  unsigned int i, size=1, ch;
  if (*scan < 0x80) ch=*scan;
  else if (*scan < 0xE0) {size=2; ch=*scan & 0x1F;}
  else if (*scan < 0xF0) {size=3; ch=*scan & 0x0F;}
  else if (*scan < 0xF8) {size=4; ch=*scan & 0x07;}
  else if (*scan < 0xFC) {size=5; ch=*scan & 0x3;}     
  else if (*scan < 0xFE) {size=6; ch=*scan & 0x1;}
  else return(-1); /* ch=0, raise_error( make3( TLREF(0), NIL_OBJ,
make_string("bad character size"), int2fx(scan-s) ) ); */

  if( scan++ + size > limit )
    return(-2);
/*    raise_error( make3( TLREF(0), NIL_OBJ, make_string("short
character"), int2fx(scan-s) ) );*/
  for(i=size-1; i ;--i) {
    if ((*scan<0x80) || (*scan >= 0xC0))
      return(-3);
/*      raise_error( make3( TLREF(0), NIL_OBJ, make_string("bad byte"),
int2fx(scan-s) ) );*/
    else { ch=(ch<<6) | (*scan++ & 0x3F); }
  }
  *rsize=size;
  return(ch);
EOF
))

(define (utf8-string-getc str start)
  (let-location
   ((size integer))
   (let ((c (utf8-string-getc* str (string-length str) start (location
size))))
     (integer->char c))))

(define open-utf8-input-string
  (let ([make-input-port make-input-port]
        [string-length string-length])
    (lambda (str)
      (let ((index 0))
        (let-location ((size integer))
        (make-input-port
         (lambda ()
           (if (fx< index (string-length str))
               (let ((c (utf8-string-getc*
                         str (string-length str) index (location size))))
                 (set! index (fx+ index size))
                 (integer->char c))
               #!eof))
         (lambda () (fx< index (string-length str)))
         (lambda () #t)
         (lambda ()
           (if (fx< index (string-length str))
               (let ((c (utf8-string-getc*
                         str (string-length str) index (location size))))
                 (integer->char c))
               #!eof))))))))

(define (call-with-utf8-input-string str proc)
  (proc (open-utf8-input-string str)))

(define open-utf8-output-string open-output-string)

(define (call-with-utf8-output-string proc)
  (let ((port (open-utf8-output-string)))
    (proc port)
    (close-output-port port)))

;; Tell the string index of a byte offset into an utf8 encoded string.
;; Reverse to utf8-seek.

(define utf8-tell0
  (foreign-lambda*
   integer
   ((scheme-object str) (integer sl)
    (integer start) (integer index)
    (integer pos))
   #<<EOF
  unsigned char *s=(unsigned char *)C_c_string(str);
  unsigned char *scan=s+start;
  unsigned char *limit=scan+sl;
  if( s+pos > limit ) {
   return(-1);
/*    raise_error( make3( TLREF(0), NIL_OBJ, make_string("index out of
bounds"), int2fx(pos) ) ); */
  } else {
   limit = s+pos;
  }
  while ( scan < limit ) {
   ++index;
   if (*scan < 0x80) scan++;
   else if (*scan < 0xE0) scan+=2;
   else if (*scan < 0xF0) scan+=3;
   else if (*scan < 0xF8) scan+=4;
   else if (*scan < 0xFC) scan+=5;
   else if (*scan < 0xFE) scan+=6;
   else return(-2);
/* raise_error( make3( TLREF(0), NIL_OBJ, make_string("bad string"),
int2fx(scan-s) ) );*/
  }
  return(index);
EOF
))

(define (utf8-tell str start index pos)
  (let ((v (utf8-tell0 str (string-length str) start index pos)))
    (if (fx>= v 0) v
        (raise (case v
                 ((-1) "index out of bounds")
                 ((-2) "bad string")
                 (else 'utf8-tell))))))

(define (utf8-string-length s) (utf8-tell s 0 0 (string-length s)))

;;** Levenshtein

;; See http://www.merriampark.com/ld.htm

(define (lev-init mx)
  (##sys#check-exact mx 'levenshtein-matrix-allocation)
  (cond-expand
   [unsafe]
   [else (when (fx< mx 0)
               (##sys#error 'levenshtein-matrix-allocation
                            "size is negative" mx))])
  (let ((result (make-u32vector (add1 mx))))
    ((foreign-lambda*
      void
      ((u32vector line) (integer m))
      "int *p=line; int i; for(i=0; i<=m; ++i) p[i]=i;")
     result mx)
    result))

(define (lev-dist line m) (u32vector-ref line m))

(define lev-step!
  (foreign-lambda*
   integer
   ((u32vector matrix) (integer m) (integer i)
    (pointer a) (pointer b) (integer o))
   #<<EOF
  int *d_i = (int*) matrix;
  unsigned char *s = (unsigned char *)(a) + o;
  unsigned char *t = (unsigned char *)(b) + o;
  int distance=d_i[0], j, left, cost;

  d_i[0]=i;
  for(j=1; j<=m; ++j) {               /* row loop */
    left = d_i[j];
    /* Step 5 */
    cost = s[i-1]==t[j-1] ? 0 : 1;
    /* Step 6 */
    d_i[j] =  min(min(d_i[j-1]+1, left+1), distance+cost);
    distance = left;
  }
  return(distance);
EOF
))

(define utf8-lev-step0!
  (foreign-lambda*
   integer
   ((u32vector matrix) (integer m) (integer i) (integer ix)
    (scheme-object ai) (integer al)
    (scheme-object bi) (integer o) (integer aoi) (integer boi))
   #<<EOF
   int *d_i = (int*) matrix;
  unsigned char *a = C_c_string(ai);
  unsigned char *b = C_c_string(bi);
  unsigned char *s = (unsigned char *)(a) + aoi;
  unsigned char *t = (unsigned char *)(b) + boi;
  unsigned char *scan = s + ix;
  unsigned char *limit = s + al;
  int size, si, distance=d_i[0], j, left, cost;

  d_i[0]=i+1;
  for(j=0; j<=m-1; ++j) {               /* row loop */
    left = d_i[j+1];
    /* Step 5 */
    cost = 0;
    size = 1;
    if (*scan < 0x80) cost = *scan == t[j] ? 0 : 1;
    else if (*scan < 0xE0) {
      size=2;
      cost = (*scan & 0x1F) == (t[j] & 0x1F) ? 0 : 1;
    } else if (*scan < 0xF0) {
      size=3;
      cost = (*scan & 0x0F) == (t[j] & 0x0F) ? 0 : 1;
    } else if (*scan < 0xF8) {
      size=4;
      cost = (*scan & 0x07) == (t[j] & 0x07) ? 0 : 1;
    } else if (*scan < 0xFC) {
      size=5;
      cost = (*scan & 0x3) == (t[j] & 0x3) ? 0 : 1;
    } else if (*scan < 0xFE) {
      size=6;
      cost = (*scan & 0x1) == (t[j] & 0x1) ? 0 : 1;
    } else return(-1); /* raise_error( make3( TLREF(0), NIL_OBJ,
make_string("bad character size"), int2fx(scan-s) ) ); */

    if( scan + size > limit )
      return(-2);
/*      raise_error( make3( TLREF(0), NIL_OBJ, make_string("short
character"), int2fx(scan-s) ) ); */
     for(si=1; si<size ; ++si) {
       if ((scan[si]<0x80) || (scan[si] >= 0xC0))
         return(-3);
/*         raise_error( make3( TLREF(0), NIL_OBJ, make_string("bad
byte"), int2fx(scan-s) ) ); */
       else {
         cost |= (scan[si] & 0x3F) == (t[j+si] & 0x3F) ? 0 : 1;
       }
     }

    /* Step 6 */
    d_i[j+1] =  min(min(d_i[j]+1, left+1), distance+cost);
    distance = left;
  }
  return(distance);
EOF
))

(define (utf8-lev-step! matrix m i ix a b off aoff boff)
  (let ((v (utf8-lev-step0! matrix m i ix a (string-length a) b off aoff
boff)))
    (if (fx>= v 0) v
        (raise (case v
                 ((-1) "bad character size")
                 ((-2) "short character")
                 ((-3) "bad byte")
                 (else 'utf8-lev-step!))))))

(define (lev-0 a b)
  (do ((m (sub1 (string-length a)) (sub1 m))
       (n (sub1 (string-length b)) (sub1 n)))
      ((or (fx< m 0) (fx< n 0) (not (eqv? (string-ref a m) (string-ref b
n))))
       (do ((m (add1 m) (sub1 m))
            (n (add1 n) (sub1 n))
            (i 0 (add1 i)))
           ((or (fx< m 1) (fx< n 1)
                (not (eqv? (string-ref a i) (string-ref b i))))
            (values i m n))))))

(define (utf8-lev-0 a b)
  (let ((m (utf8-string-length a)) (n (utf8-string-length b)))
    (define (skip-prefix i ai bi m n)
      (if (or (eqv? m 0) (eqv? n 0)
              (not (eqv? (utf8-string-getc a ai) (utf8-string-getc b
bi))))
          (values i ai bi m n)
          (skip-prefix (add1 i)
                       (utf8-seek a ai i (add1 i))
                       (utf8-seek b bi i (add1 i))
                       (sub1 m) (sub1 n))))
    (let loop ((m m) (mi (utf8-seek a (string-length a) m (sub1 m)))
               (n n) (ni (utf8-seek b (string-length b) n (sub1 n))))
      (if (or (eqv? m 0) (eqv? n 0))
          (skip-prefix 0 0 0 m n)
          (let ((mi1 (utf8-seek a mi m (sub1 m)))
                (ni1 (utf8-seek a ni n (sub1 n))))
            (if (eqv? (utf8-string-getc a mi) (utf8-string-getc b ni))
                (loop (sub1 m) mi (sub1 n) ni)
                (skip-prefix 0 0 0 m n)))))))

(define (levenshtein-distance s t)
  (receive
   (off sl tl) (lev-0 s t)
    (cond
     ((eqv? sl 0) tl)
     ((eqv? tl 0) sl)
     (else
      (if (< tl sl)
          (lev-exec (lev-init sl) s t off sl tl)
          (lev-exec (lev-init tl) t s off tl sl))))))

(define (utf8-levenshtein-distance s t)
  (receive
   (off soi toi sl tl) (utf8-lev-0 s t)
   (cond
    ((eqv? sl 0) tl)
    ((eqv? tl 0) sl)
    (else
     (if (< tl sl)
         (utf8-lev-exec (lev-init sl) s t off soi toi sl tl)
         (utf8-lev-exec (lev-init tl) t s off toi soi tl sl))))))

(define (lev-exec matrix s t o m n)
  (do ((i 1 (add1 i)))
      ((> i n) (lev-dist matrix m))
    (lev-step! matrix m i s t o)))

(define (utf8-lev-exec matrix s t o soi toi m n)
  (do ((i 0 (add1 i))
       (ix 0 (utf8-seek s ix i (add1 i))))
      ((>= i n) (lev-dist matrix m))
    (utf8-lev-step! matrix m i ix s t o soi toi)))

(define (lev-exec< matrix s t o m n limit)
  (let loop ((i 1) (distance 0))
    (cond
     ((>= distance limit) #f)
     ((> i n) (>= (lev-dist matrix m) limit))
     (else (loop (add1 i) (lev-step! matrix m i s t o))))))

(define (utf8-lev-exec< matrix s t o soi toi m n limit)
  (let loop ((i 1) (ix 0) (distance 0))
    (cond
     ((>= distance limit) #f)
     ((> i n) (>= (lev-dist matrix m) limit))
     (else (loop (add1 i)
                 (utf8-seek s i ix (add1 i))
                 (utf8-lev-step! matrix m i ix s t o soi toi))))))

(define (levenshtein< s t limit)
  (receive
   (off sl tl) (lev-0 s t)
    (cond
     ((eqv? sl 0) (< tl limit))
     ((eqv? tl 0) (< sl limit))
     (else
      (if (< tl sl)
          (lev-exec< (lev-init sl) s t off sl tl limit)
          (lev-exec< (lev-init tl) t s off tl sl limit))))))

(define (utf8-levenshtein< s t limit)
  (receive
   (off soi toi sl tl) (utf8-lev-0 s t)
    (cond
     ((eqv? sl 0) (< tl limit))
     ((eqv? tl 0) (< sl limit))
     (else
      (if (< tl sl)
          (utf8-lev-exec< (lev-init sl) s t off soi toi sl tl limit)
          (utf8-lev-exec< (lev-init tl) t s off toi soi tl sl
limit))))))






reply via email to

[Prev in Thread] Current Thread [Next in Thread]