Firebird: Levenshtein-Distanz

3. Code Generator

Hier kann der SQL-Quelltext zur Berechnung der Levenshtein-Distanz nach eigenen Bedürfnissen zusammengestellt werden. Die einzelnen Bezeichner werden auf Länge und doppelte Namen geprüft. Zu lange Bezeichner werden gegebenenfalls auf 31 Zeichen gekürzt. Weitere Prüfungen, insbesondere die Fehlerfreie Ausführung des SQL-Codes, werden nicht vorgenommen. Der generierte SQL-Quelltext sollte auf jeden Fall in einer Testdatenbank validiert werden, bevor er in eine vorhanden Datenbank aufgenommen wird.

SQL-Namen

Domänen

boolescher Wert
32 bit Integer
einzelnes Zeichen
zu untersuchendes Wort
  

temporäre Tabelle

Tabellenname
Feld I
Feld J
Feld D

gespeicherte Prozedur

Prozedurname

Parameter

erstes Wort
zweites Wort
Flag für Damerau-Erweiterung
Ausgabe der Distanz
  

lokale Variablen

Länge erstes Wort
Länge zweites Wort
 
Laufvariable I
Laufvariable J
 
aktuelles Zeichen erstes Wort
aktuelles Zeichen zweites Wort
 
aktuelle Gewichtung
  
 

generierter Code

SET SQL DIALECT 3;

/******************************************************************************/
/****                               Domains                                ****/
/******************************************************************************/

CREATE DOMAIN "dom_BOOL" AS
SMALLINT
NOT NULL
CHECK (VALUE IN (0, 1));

CREATE DOMAIN "dom_INT32" AS
INTEGER;

CREATE DOMAIN "dom_SingleChar" AS
CHAR(1) CHARACTER SET UTF8;

CREATE DOMAIN "dom_Word" AS
VARCHAR(1021) CHARACTER SET UTF8
NOT NULL;

/******************************************************************************/
/****                          Stored Procedures                           ****/
/******************************************************************************/

SET TERM ^ ;

CREATE PROCEDURE "prc_levenshtein" (
    WORD1 TYPE OF "dom_Word",
    WORD2 TYPE OF "dom_Word",
    USE_DAMERAU TYPE OF "dom_BOOL" = 0)
RETURNS (
    DISTANCE TYPE OF "dom_INT32")
AS
BEGIN
  SUSPEND;
END^

SET TERM ; ^

/******************************************************************************/
/****                                Tables                                ****/
/******************************************************************************/

CREATE GLOBAL TEMPORARY TABLE "_temp_levenshtein" (
    I  "dom_INT32" NOT NULL,
    J  "dom_INT32" NOT NULL,
    D  "dom_INT32"
) ON COMMIT PRESERVE ROWS;

/******************************************************************************/
/****                             Primary Keys                             ****/
/******************************************************************************/

ALTER TABLE "_temp_levenshtein" ADD CONSTRAINT "PK_temp_levenshtein" PRIMARY KEY (I, J);

/******************************************************************************/
/****                          Stored Procedures                           ****/
/******************************************************************************/

SET TERM ^ ;

ALTER PROCEDURE "prc_levenshtein" (
    WORD1 TYPE OF "dom_Word",
    WORD2 TYPE OF "dom_Word",
    USE_DAMERAU TYPE OF "dom_BOOL" = 0)
RETURNS (
    DISTANCE TYPE OF "dom_INT32")
AS
DECLARE VARIABLE LEN1 TYPE OF "dom_INT32";
DECLARE VARIABLE LEN2 TYPE OF "dom_INT32";
DECLARE VARIABLE I TYPE OF "dom_INT32";
DECLARE VARIABLE J TYPE OF "dom_INT32";
DECLARE VARIABLE COST TYPE OF "dom_INT32";
DECLARE VARIABLE CHAR_I TYPE OF "dom_SingleChar";
DECLARE VARIABLE CHAR_J TYPE OF "dom_SingleChar";
BEGIN
/*******************************************************************************/
/* Damerau-Levenshtein-Distanz bei Wikipedia                                   */
/* http://de.wikipedia.org/wiki/Levenstein-Distanz#Damerau-Levenshtein-Distanz */
/*******************************************************************************/

  /* Wortlängen zwischenspeichern */
  LEN1 = COALESCE(CHAR_LENGTH(WORD1), 0);
  LEN2 = COALESCE(CHAR_LENGTH(WORD2), 0);

  /* Array initialisieren */
  I = 0;
  WHILE (I <= LEN1)
  DO
  BEGIN
    /* eintragen */
    UPDATE OR INSERT INTO "_temp_levenshtein"
    (I, J, D)
    VALUES
    (:I, 0, :I)
    MATCHING
    (I, J);
    /* nächste Runde */
    I = I + 1;
  END
  J = 0;
  WHILE (J <= LEN2)
  DO
  BEGIN
    /* eintragen */
    UPDATE OR INSERT INTO "_temp_levenshtein"
    (I, J, D)
    VALUES
    (0, :J, :J3)
    MATCHING
    (I, J);
    /* nächste Runde */
    J = J + 1;
  END

  /* Berechnung */
  I = 1;
  WHILE (I <= LEN1)
  DO
  BEGIN
    CHAR_I = SUBSTRING(WORD1 FROM I FOR 1);
    J = 1;
    WHILE (J <= LEN2)
    DO
    BEGIN

      CHAR_J = SUBSTRING(WORD2 FROM J FOR 1);
      IF (CHAR_I = CHAR_J)
      THEN
        COST = 0;
      ELSE
        COST = 1;

      IF (
           (USE_DAMERAU = 1)
           AND
           (I > 1)
           AND
           (J > 1)
           AND
           (CHAR_I = SUBSTRING(WORD2 FROM J - 1 FOR 1))
           AND
           (SUBSTRING(WORD1 FROM I - 1 FOR 1) = CHAR_J)
         )
      THEN /* Damerau-Levenshtein-Distanz */
      BEGIN
        IF (COST = 1)
        THEN /* COST = 1 */
          UPDATE OR INSERT INTO "_temp_levenshtein"
          (I, J, D)
          VALUES
          (
            :I, :J,
            (
              SELECT MIN(D) + 1
                FROM "_temp_levenshtein"
               WHERE (I = :I - 1 AND J = :J    )
                  OR (I = :I     AND J = :J - 1)
                  OR (I = :I - 1 AND J = :J - 1)
                  OR (I = :I - 2 AND J = :J - 2)
            )
          )
          MATCHING
          (I, J);
        ELSE /* COST = 0 */
          UPDATE OR INSERT INTO "_temp_levenshtein"
          (I, J, D)
          VALUES
          (
            :I, :J,
            (
              SELECT MIN(D) FROM
              (
                SELECT D + 1 AS D
                  FROM "_temp_levenshtein"
                 WHERE (I = :I - 1 AND J = :J    )
                    OR (I = :I     AND J = :J - 1)
                UNION ALL
                SELECT D
                  FROM "_temp_levenshtein"
                 WHERE (I = :I - 1 AND J = :J - 1)
                    OR (I = :I - 2 AND J = :J - 2)
              )
            )
          )
          MATCHING
          (I, J);
      END
      ELSE /* Levenshtein-Distanz */
      BEGIN
        IF (COST = 1)
        THEN /* COST = 1 */
          UPDATE OR INSERT INTO "_temp_levenshtein"
          (I, J, D)
          VALUES
          (
            :I, :J,
            (
              SELECT MIN(D) + 1
                FROM "_temp_levenshtein"
               WHERE (I = :I - 1 AND J = :J    )
                  OR (I = :I     AND J = :J - 1)
                  OR (I = :I - 1 AND J = :J - 1)
            )
          )
          MATCHING
          (I, J);
        ELSE /* COST = 0 */
          UPDATE OR INSERT INTO "_temp_levenshtein"
          (I, J, D)
          VALUES
          (
            :I, :J,
            (
              SELECT MIN(D) FROM
              (
                SELECT D + 1 AS D
                  FROM "_temp_levenshtein"
                 WHERE (I = :I - 1 AND J = :J    )
                    OR (I = :I     AND J = :J - 1)
                UNION ALL
                SELECT D
                  FROM "_temp_levenshtein"
                 WHERE (I = :I - 1 AND J = :J - 1)
              )
            )
          )
          MATCHING
          (I, J);
      END
      /* nächste Runde */
      J = J + 1;
    END
    /* nächste Runde */
    I = I + 1;
  END

  /* Ergebnis */
  SELECT D
    FROM "_temp_levenshtein"
   WHERE I = :LEN1 AND J = :LEN2
    INTO :DISTANCE;

END^

SET TERM ; ^

/******************************************************************************/
/****                             Descriptions                             ****/
/******************************************************************************/

UPDATE RDB$FIELDS
SET RDB$DESCRIPTION = 'Boolescher Wert'
WHERE (RDB$FIELD_NAME = 'dom_BOOL');

UPDATE RDB$FIELDS
SET RDB$DESCRIPTION = 'Ganzzahl mit Vorzeichen 32 bit'
WHERE (RDB$FIELD_NAME = 'dom_INT32');

UPDATE RDB$FIELDS
SET RDB$DESCRIPTION = 'Einzelzeichen'
WHERE (RDB$FIELD_NAME = 'dom_SingleChar');

UPDATE RDB$FIELDS
SET RDB$DESCRIPTION = 'Wortfeld'
WHERE (RDB$FIELD_NAME = 'dom_Word');

COMMIT WORK;

/******************************************************************************/
/****                              Privileges                              ****/
/******************************************************************************/

/* Privileges of users */
GRANT ALL ON RDB$FORMATS TO DBUSER WITH GRANT OPTION;
GRANT ALL ON RDB$PAGES TO DBUSER WITH GRANT OPTION;
GRANT ALL ON RDB$ROLES TO DBUSER WITH GRANT OPTION;
GRANT ALL ON "_temp_levenshtein" TO DBUSER WITH GRANT OPTION;
GRANT EXECUTE ON PROCEDURE "prc_levenshtein" TO DBUSER;
GRANT SELECT ON RDB$FORMATS TO PUBLIC;
GRANT SELECT ON RDB$PAGES TO PUBLIC;
GRANT SELECT ON RDB$ROLES TO PUBLIC;

/* Privileges of procedures */
GRANT ALL ON "_temp_levenshtein" TO PROCEDURE "prc_levenshtein";