bdd tests: do not query word table directly

author Sarah Hoffmann <lonvia@denofr.de>

Sat, 24 Jul 2021 10:12:31 +0000 (12:12 +0200)

committer Sarah Hoffmann <lonvia@denofr.de>

Wed, 28 Jul 2021 09:31:47 +0000 (11:31 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Sat, 24 Jul 2021 10:12:31 +0000 (12:12 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 28 Jul 2021 09:31:47 +0000 (11:31 +0200)
diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php

index 796635eeb7dbebe788e727b59994a9d990aa9dea..9bd9828cb0703ce388108713856c6f176c15feb4 100644 (file)
--- a/lib-php/tokenizer/legacy_icu_tokenizer.php
+++ b/lib-php/tokenizer/legacy_icu_tokenizer.php
@@ -19,7 +19,7 @@ class Tokenizer
  
      public function checkStatus()
      {
-        $sSQL = "SELECT word_id FROM word WHERE word_token == 'a'";
+        $sSQL = "SELECT word_id FROM word limit 1";
          $iWordID = $this->oDB->getOne($sSQL);
          if ($iWordID === false) {
              throw new Exception('Query failed', 703);
@@ -145,10 +145,10 @@ class Tokenizer
      private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
      {
          // Check which tokens we have, get the ID numbers
-        $sSQL = 'SELECT word_id, word_token, type';
+        $sSQL = 'SELECT word_id, word_token, type,';
          $sSQL .= "      info->>'cc' as country, info->>'postcode' as postcode,";
          $sSQL .= "      info->>'op' as operator,";
-        $sSQL .= "      info->>'class' as class, info->>'type' as type,";
+        $sSQL .= "      info->>'class' as class, info->>'type' as ctype,";
          $sSQL .= "      info->>'count' as count";
          $sSQL .= ' FROM word WHERE word_token in (';
          $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
@@ -159,66 +159,60 @@ class Tokenizer
  
          foreach ($aDBWords as $aWord) {
              $iId = (int) $aWord['word_id'];
+            $sTok = $aWord['word_token'];
  
              switch ($aWord['type']) {
-                'C':  // country name tokens
-                    if ($aWord['country'] === null
-                        || ($this->aCountryRestriction
-                            && !in_array($aWord['country'], $this->aCountryRestriction))
+                case 'C':  // country name tokens
+                    if ($aWord['country'] !== null
+                        && (!$this->aCountryRestriction
+                            || in_array($aWord['country'], $this->aCountryRestriction))
                      ) {
-                        continue;
+                        $oValidTokens->addToken($sTok, new Token\Country($iId, $aWord['country']));
                      }
-                    $oToken = new Token\Country($iId, $aWord['country'])
                      break;
-                'H':  // house number tokens
-                    $oToken = new Token\HouseNumber($iId, $aWord['word_token']);
+                case 'H':  // house number tokens
+                    $oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $aWord['word_token']));
                      break;
-                'P':  // postcode tokens
+                case 'P':  // postcode tokens
                      // Postcodes are not normalized, so they may have content
                      // that makes SQL injection possible. Reject postcodes
                      // that would need special escaping.
-                    if ($aWord['postcode'] === null
-                        || pg_escape_string($aWord['postcode']) == $aWord['postcode']
+                    if ($aWord['postcode'] !== null
+                        && pg_escape_string($aWord['postcode']) == $aWord['postcode']
                      ) {
-                       continue;
+                        $sNormPostcode = $this->normalizeString($aWord['postcode']);
+                        if (strpos($sNormQuery, $sNormPostcode) !== false) {
+                            $oValidTokens->addToken($sTok, new Token\Postcode($iId, $aWord['postcode'], null));
+                        }
                      }
-                    $sNormPostcode = $this->normalizeString($aWord['postcode']);
-                    if (strpos($sNormQuery, $sNormPostcode) === false) {
-                        continue;
-                    }
-                    $oToken = new Token\Postcode($iId, $aWord['postcode'], null);
                      break;
-                'S':  // tokens for classification terms (special phrases)
-                    if ($aWord['class'] === null || $aWord['type'] === null
-                    ) {
-                        continue;
+                case 'S':  // tokens for classification terms (special phrases)
+                    if ($aWord['class'] !== null && $aWord['ctype'] !== null) {
+                        $oValidTokens->addToken($sTok, new Token\SpecialTerm(
+                            $iId,
+                            $aWord['class'],
+                            $aWord['ctype'],
+                            (isset($aWord['op'])) ? Operator::NEAR : Operator::NONE
+                        ));
                      }
-                    $oToken = new Token\SpecialTerm(
-                        $iId,
-                        $aWord['class'],
-                        $aWord['type'],
-                        $aWord['op'] ? Operator::NEAR : Operator::NONE
-                    );
                      break;
-                'W': // full-word tokens
-                    $oToken = new Token\Word(
+                case 'W': // full-word tokens
+                    $oValidTokens->addToken($sTok, new Token\Word(
                          $iId,
                          (int) $aWord['count'],
                          substr_count($aWord['word_token'], ' ')
-                    );
+                    ));
                      break;
-                'w':  // partial word terms
-                    $oToken = new Token\Partial(
+                case 'w':  // partial word terms
+                    $oValidTokens->addToken($sTok, new Token\Partial(
                          $iId,
                          $aWord['word_token'],
                          (int) $aWord['count']
-                    );
+                    ));
                      break;
                  default:
-                    continue;
+                    break;
              }
-
-            $oValidTokens->addToken($aWord['word_token'], $oToken);
          }
      }
  
diff --git a/test/bdd/db/import/postcodes.feature b/test/bdd/db/import/postcodes.feature

index 6102e99ba1b00925bbd754b700c9e80344736d9f..4c839db00143e004b28f5d98dd6890102475e5e0 100644 (file)
--- a/test/bdd/db/import/postcodes.feature
+++ b/test/bdd/db/import/postcodes.feature
@@ -134,9 +134,7 @@ Feature: Import of postcodes
          Then location_postcode contains exactly
             | country | postcode | geometry |
             | de      | 01982    | country:de |
-        And word contains
-           | word  | class | type |
-           | 01982 | place | postcode |
+        And there are word tokens for postcodes 01982
  
      Scenario: Different postcodes with the same normalization can both be found
          Given the places
diff --git a/test/bdd/db/update/postcode.feature b/test/bdd/db/update/postcode.feature

index 94550ffd6b3f764a0687a8ae628eb154feacdb75..c2fb30ceb1b1ff479235a4041e4f4187f6e65d1f 100644 (file)
--- a/test/bdd/db/update/postcode.feature
+++ b/test/bdd/db/update/postcode.feature
@@ -18,10 +18,7 @@ Feature: Update of postcode
             | country | postcode | geometry |
             | de      | 01982    | country:de |
             | ch      | 4567     | country:ch |
-        And word contains
-           | word  | class | type |
-           | 01982 | place | postcode |
-           | 4567  | place | postcode |
+        And there are word tokens for postcodes 01982,4567
  
       Scenario: When the last postcode is deleted, it is deleted from postcode and word
          Given the places
@@ -34,12 +31,8 @@ Feature: Update of postcode
          Then location_postcode contains exactly
             | country | postcode | geometry |
             | ch      | 4567     | country:ch |
-        And word contains not
-           | word  | class | type |
-           | 01982 | place | postcode |
-        And word contains
-           | word  | class | type |
-           | 4567  | place | postcode |
+        And there are word tokens for postcodes 4567
+        And there are no word tokens for postcodes 01982
  
       Scenario: A postcode is not deleted from postcode and word when it exist in another country
          Given the places
@@ -52,9 +45,7 @@ Feature: Update of postcode
          Then location_postcode contains exactly
             | country | postcode | geometry |
             | ch      | 01982    | country:ch |
-        And word contains
-           | word  | class | type |
-           | 01982 | place | postcode |
+        And there are word tokens for postcodes 01982
  
       Scenario: Updating a postcode is reflected in postcode table
          Given the places
@@ -68,9 +59,7 @@ Feature: Update of postcode
          Then location_postcode contains exactly
             | country | postcode | geometry |
             | de      | 20453    | country:de |
-        And word contains
-           | word  | class | type |
-           | 20453 | place | postcode |
+        And there are word tokens for postcodes 20453
  
       Scenario: When changing from a postcode type, the entry appears in placex
          When importing
@@ -91,9 +80,7 @@ Feature: Update of postcode
          Then location_postcode contains exactly
             | country | postcode | geometry |
             | de      | 20453    | country:de |
-        And word contains
-           | word  | class | type |
-           | 20453 | place | postcode |
+        And there are word tokens for postcodes 20453
  
       Scenario: When changing to a postcode type, the entry disappears from placex
          When importing
@@ -114,6 +101,4 @@ Feature: Update of postcode
          Then location_postcode contains exactly
             | country | postcode | geometry |
             | de      | 01982    | country:de |
-        And word contains
-           | word  | class | type |
-           | 01982 | place | postcode |
+        And there are word tokens for postcodes 01982
diff --git a/test/bdd/steps/steps_db_ops.py b/test/bdd/steps/steps_db_ops.py

index b4f0d8532ac39984746aae2911e66ab8564232fd..be2789f326c49c47026bd4113d1aec45f0fa12fd 100644 (file)
--- a/test/bdd/steps/steps_db_ops.py
+++ b/test/bdd/steps/steps_db_ops.py
@@ -281,6 +281,39 @@ def check_word_table(context, exclude):
              else:
                  assert cur.rowcount > 0, "Row not in word table: %s" % '/'.join(values)
  
+
+@then("there are(?P<exclude> no)? word tokens for postcodes (?P<postcodes>.*)")
+def check_word_table_for_postcodes(context, exclude, postcodes):
+    """ Check that the tokenizer produces postcode tokens for the given
+        postcodes. The postcodes are a comma-separated list of postcodes.
+        Whitespace matters.
+    """
+    nctx = context.nominatim
+    tokenizer = tokenizer_factory.get_tokenizer_for_db(nctx.get_test_config())
+    with tokenizer.name_analyzer() as ana:
+        plist = [ana.normalize_postcode(p) for p in postcodes.split(',')]
+
+    plist.sort()
+
+    with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+        if nctx.tokenizer == 'legacy_icu':
+            cur.execute("""SELECT info->>'postcode' FROM word
+                           WHERE type = 'P' and info->>'postcode' = any(%s)""",
+                        (plist,))
+        else:
+            cur.execute("""SELECT word FROM word WHERE word = any(%s)
+                             and class = 'place' and type = 'postcode'""",
+                        (plist,))
+
+        found = [row[0] for row in cur]
+        assert len(found) == len(set(found)), f"Duplicate rows for postcodes: {found}"
+
+    if exclude:
+        assert len(found) == 0, f"Unexpected postcodes: {found}"
+    else:
+        assert set(found) == set(plist), \
+        f"Missing postcodes {set(plist) - set(found)}. Found: {found}"
+
  @then("place_addressline contains")
  def check_place_addressline(context):
      """ Check the contents of the place_addressline table. Each row represents
author	Sarah Hoffmann <lonvia@denofr.de>
	Sat, 24 Jul 2021 10:12:31 +0000 (12:12 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 28 Jul 2021 09:31:47 +0000 (11:31 +0200)
lib-php/tokenizer/legacy_icu_tokenizer.php		patch | blob | history
test/bdd/db/import/postcodes.feature		patch | blob | history
test/bdd/db/update/postcode.feature		patch | blob | history
test/bdd/steps/steps_db_ops.py		patch | blob | history