git.openstreetmap.org Git - nominatim.git/commitdiff
bdd tests: do not query word table directly
author Sarah Hoffmann <lonvia@denofr.de>
Sat, 24 Jul 2021 10:12:31 +0000 (12:12 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 28 Jul 2021 09:31:47 +0000 (11:31 +0200)
The BDD tests can no longer make assumptions about the structure of the
word table, because that structure depends on the tokenizer in use. Use
more abstract step descriptions instead that ask for specific kinds of
tokens.
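
For illustration, both forms below are taken from the postcode feature
diffs in this commit. A check that previously matched rows in the word
table directly

           | word  | class | type |
           | 01982 | place | postcode |
        And word contains

is replaced by a tokenizer-agnostic step that only states which tokens
must (or must not) exist:

        And there are word tokens for postcodes 01982
        And there are no word tokens for postcodes 01982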

lib-php/tokenizer/legacy_icu_tokenizer.php
test/bdd/db/import/postcodes.feature
test/bdd/db/update/postcode.feature
test/bdd/steps/steps_db_ops.py

lib-php/tokenizer/legacy_icu_tokenizer.php
index 796635eeb7dbebe788e727b59994a9d990aa9dea..9bd9828cb0703ce388108713856c6f176c15feb4 100644
@@ -19,7 +19,7 @@ class Tokenizer
 
     public function checkStatus()
     {
-        $sSQL = "SELECT word_id FROM word WHERE word_token == 'a'";
+        $sSQL = "SELECT word_id FROM word limit 1";
         $iWordID = $this->oDB->getOne($sSQL);
         if ($iWordID === false) {
             throw new Exception('Query failed', 703);
@@ -145,10 +145,10 @@ class Tokenizer
     private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
     {
         // Check which tokens we have, get the ID numbers
-        $sSQL = 'SELECT word_id, word_token, type';
+        $sSQL = 'SELECT word_id, word_token, type,';
         $sSQL .= "      info->>'cc' as country, info->>'postcode' as postcode,";
         $sSQL .= "      info->>'op' as operator,";
-        $sSQL .= "      info->>'class' as class, info->>'type' as type,";
+        $sSQL .= "      info->>'class' as class, info->>'type' as ctype,";
         $sSQL .= "      info->>'count' as count";
         $sSQL .= ' FROM word WHERE word_token in (';
         $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
@@ -159,66 +159,60 @@ class Tokenizer
 
         foreach ($aDBWords as $aWord) {
             $iId = (int) $aWord['word_id'];
+            $sTok = $aWord['word_token'];
 
             switch ($aWord['type']) {
-                'C':  // country name tokens
-                    if ($aWord['country'] === null
-                        || ($this->aCountryRestriction
-                            && !in_array($aWord['country'], $this->aCountryRestriction))
+                case 'C':  // country name tokens
+                    if ($aWord['country'] !== null
+                        && (!$this->aCountryRestriction
+                            || in_array($aWord['country'], $this->aCountryRestriction))
                     ) {
-                        continue;
+                        $oValidTokens->addToken($sTok, new Token\Country($iId, $aWord['country']));
                     }
-                    $oToken = new Token\Country($iId, $aWord['country'])
                     break;
-                'H':  // house number tokens
-                    $oToken = new Token\HouseNumber($iId, $aWord['word_token']);
+                case 'H':  // house number tokens
+                    $oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $aWord['word_token']));
                     break;
-                'P':  // postcode tokens
+                case 'P':  // postcode tokens
                     // Postcodes are not normalized, so they may have content
                     // that makes SQL injection possible. Reject postcodes
                     // that would need special escaping.
-                    if ($aWord['postcode'] === null
-                        || pg_escape_string($aWord['postcode']) == $aWord['postcode']
+                    if ($aWord['postcode'] !== null
+                        && pg_escape_string($aWord['postcode']) == $aWord['postcode']
                     ) {
-                       continue;
+                        $sNormPostcode = $this->normalizeString($aWord['postcode']);
+                        if (strpos($sNormQuery, $sNormPostcode) !== false) {
+                            $oValidTokens->addToken($sTok, new Token\Postcode($iId, $aWord['postcode'], null));
+                        }
                     }
-                    $sNormPostcode = $this->normalizeString($aWord['postcode']);
-                    if (strpos($sNormQuery, $sNormPostcode) === false) {
-                        continue;
-                    }
-                    $oToken = new Token\Postcode($iId, $aWord['postcode'], null);
                     break;
-                'S':  // tokens for classification terms (special phrases)
-                    if ($aWord['class'] === null || $aWord['type'] === null
-                    ) {
-                        continue;
+                case 'S':  // tokens for classification terms (special phrases)
+                    if ($aWord['class'] !== null && $aWord['ctype'] !== null) {
+                        $oValidTokens->addToken($sTok, new Token\SpecialTerm(
+                            $iId,
+                            $aWord['class'],
+                            $aWord['ctype'],
+                            (isset($aWord['op'])) ? Operator::NEAR : Operator::NONE
+                        ));
                     }
-                    $oToken = new Token\SpecialTerm(
-                        $iId,
-                        $aWord['class'],
-                        $aWord['type'],
-                        $aWord['op'] ? Operator::NEAR : Operator::NONE
-                    );
                     break;
-                'W': // full-word tokens
-                    $oToken = new Token\Word(
+                case 'W': // full-word tokens
+                    $oValidTokens->addToken($sTok, new Token\Word(
                         $iId,
                         (int) $aWord['count'],
                         substr_count($aWord['word_token'], ' ')
-                    );
+                    ));
                     break;
-                'w':  // partial word terms
-                    $oToken = new Token\Partial(
+                case 'w':  // partial word terms
+                    $oValidTokens->addToken($sTok, new Token\Partial(
                         $iId,
                         $aWord['word_token'],
                         (int) $aWord['count']
-                    );
+                    ));
                     break;
                 default:
-                    continue;
+                    break;
             }
-
-            $oValidTokens->addToken($aWord['word_token'], $oToken);
         }
     }
 
test/bdd/db/import/postcodes.feature
index 6102e99ba1b00925bbd754b700c9e80344736d9f..4c839db00143e004b28f5d98dd6890102475e5e0 100644
@@ -134,9 +134,7 @@ Feature: Import of postcodes
         Then location_postcode contains exactly
            | country | postcode | geometry |
            | de      | 01982    | country:de |
-        And word contains
-           | word  | class | type |
-           | 01982 | place | postcode |
+        And there are word tokens for postcodes 01982
 
     Scenario: Different postcodes with the same normalization can both be found
         Given the places
test/bdd/db/update/postcode.feature
index 94550ffd6b3f764a0687a8ae628eb154feacdb75..c2fb30ceb1b1ff479235a4041e4f4187f6e65d1f 100644
@@ -18,10 +18,7 @@ Feature: Update of postcode
            | country | postcode | geometry |
            | de      | 01982    | country:de |
            | ch      | 4567     | country:ch |
-        And word contains
-           | word  | class | type |
-           | 01982 | place | postcode |
-           | 4567  | place | postcode |
+        And there are word tokens for postcodes 01982,4567
 
      Scenario: When the last postcode is deleted, it is deleted from postcode and word
         Given the places
@@ -34,12 +31,8 @@ Feature: Update of postcode
         Then location_postcode contains exactly
            | country | postcode | geometry |
            | ch      | 4567     | country:ch |
-        And word contains not
-           | word  | class | type |
-           | 01982 | place | postcode |
-        And word contains
-           | word  | class | type |
-           | 4567  | place | postcode |
+        And there are word tokens for postcodes 4567
+        And there are no word tokens for postcodes 01982
 
      Scenario: A postcode is not deleted from postcode and word when it exist in another country
         Given the places
@@ -52,9 +45,7 @@ Feature: Update of postcode
         Then location_postcode contains exactly
            | country | postcode | geometry |
            | ch      | 01982    | country:ch |
-        And word contains
-           | word  | class | type |
-           | 01982 | place | postcode |
+        And there are word tokens for postcodes 01982
 
      Scenario: Updating a postcode is reflected in postcode table
         Given the places
@@ -68,9 +59,7 @@ Feature: Update of postcode
         Then location_postcode contains exactly
            | country | postcode | geometry |
            | de      | 20453    | country:de |
-        And word contains
-           | word  | class | type |
-           | 20453 | place | postcode |
+        And there are word tokens for postcodes 20453
 
      Scenario: When changing from a postcode type, the entry appears in placex
         When importing
@@ -91,9 +80,7 @@ Feature: Update of postcode
         Then location_postcode contains exactly
            | country | postcode | geometry |
            | de      | 20453    | country:de |
-        And word contains
-           | word  | class | type |
-           | 20453 | place | postcode |
+        And there are word tokens for postcodes 20453
 
      Scenario: When changing to a postcode type, the entry disappears from placex
         When importing
@@ -114,6 +101,4 @@ Feature: Update of postcode
         Then location_postcode contains exactly
            | country | postcode | geometry |
            | de      | 01982    | country:de |
-        And word contains
-           | word  | class | type |
-           | 01982 | place | postcode |
+        And there are word tokens for postcodes 01982
test/bdd/steps/steps_db_ops.py
index b4f0d8532ac39984746aae2911e66ab8564232fd..be2789f326c49c47026bd4113d1aec45f0fa12fd 100644
@@ -281,6 +281,39 @@ def check_word_table(context, exclude):
             else:
                 assert cur.rowcount > 0, "Row not in word table: %s" % '/'.join(values)
 
+
+@then("there are(?P<exclude> no)? word tokens for postcodes (?P<postcodes>.*)")
+def check_word_table_for_postcodes(context, exclude, postcodes):
+    """ Check that the tokenizer produces postcode tokens for the given
+        postcodes. The postcodes are a comma-separated list of postcodes.
+        Whitespace matters.
+    """
+    nctx = context.nominatim
+    tokenizer = tokenizer_factory.get_tokenizer_for_db(nctx.get_test_config())
+    with tokenizer.name_analyzer() as ana:
+        plist = [ana.normalize_postcode(p) for p in postcodes.split(',')]
+
+    plist.sort()
+
+    with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+        if nctx.tokenizer == 'legacy_icu':
+            cur.execute("""SELECT info->>'postcode' FROM word
+                           WHERE type = 'P' and info->>'postcode' = any(%s)""",
+                        (plist,))
+        else:
+            cur.execute("""SELECT word FROM word WHERE word = any(%s)
+                             and class = 'place' and type = 'postcode'""",
+                        (plist,))
+
+        found = [row[0] for row in cur]
+        assert len(found) == len(set(found)), f"Duplicate rows for postcodes: {found}"
+
+    if exclude:
+        assert len(found) == 0, f"Unexpected postcodes: {found}"
+    else:
+        assert set(found) == set(plist), \
+        f"Missing postcodes {set(plist) - set(found)}. Found: {found}"
+
 @then("place_addressline contains")
 def check_place_addressline(context):
     """ Check the contents of the place_addressline table. Each row represents