]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/api/search/icu_tokenizer.py
Merge remote-tracking branch 'upstream/master'
[nominatim.git] / nominatim / api / search / icu_tokenizer.py
index 06a06f34984e32c033837bf68c362cf34ea272e6..f6590f5b36f87f0ac81f80dd5c1ef7a12fb8a726 100644 (file)
@@ -8,7 +8,6 @@
 Implementation of query analysis for the ICU tokenizer.
 """
 from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast
 Implementation of query analysis for the ICU tokenizer.
 """
 from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast
-from copy import copy
 from collections import defaultdict
 import dataclasses
 import difflib
 from collections import defaultdict
 import dataclasses
 import difflib
@@ -22,6 +21,7 @@ from nominatim.api.connection import SearchConnection
 from nominatim.api.logging import log
 from nominatim.api.search import query as qmod
 from nominatim.api.search.query_analyzer_factory import AbstractQueryAnalyzer
 from nominatim.api.logging import log
 from nominatim.api.search import query as qmod
 from nominatim.api.search.query_analyzer_factory import AbstractQueryAnalyzer
+from nominatim.db.sqlalchemy_types import Json
 
 
 DB_TO_TOKEN_TYPE = {
 
 
 DB_TO_TOKEN_TYPE = {
@@ -97,6 +97,7 @@ class ICUToken(qmod.Token):
         """ Create a ICUToken from the row of the word table.
         """
         count = 1 if row.info is None else row.info.get('count', 1)
         """ Create a ICUToken from the row of the word table.
         """
         count = 1 if row.info is None else row.info.get('count', 1)
+        addr_count = 1 if row.info is None else row.info.get('addr_count', 1)
 
         penalty = 0.0
         if row.type == 'w':
 
         penalty = 0.0
         if row.type == 'w':
@@ -121,9 +122,10 @@ class ICUToken(qmod.Token):
         else:
             lookup_word = row.word_token
 
         else:
             lookup_word = row.word_token
 
-        return ICUToken(penalty=penalty, token=row.word_id, count=count,
+        return ICUToken(penalty=penalty, token=row.word_id, count=max(1, count),
                         lookup_word=lookup_word, is_indexed=True,
                         lookup_word=lookup_word, is_indexed=True,
-                        word_token=row.word_token, info=row.info)
+                        word_token=row.word_token, info=row.info,
+                        addr_count=max(1, addr_count))
 
 
 
 
 
 
@@ -159,7 +161,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                      sa.Column('word_token', sa.Text, nullable=False),
                      sa.Column('type', sa.Text, nullable=False),
                      sa.Column('word', sa.Text),
                      sa.Column('word_token', sa.Text, nullable=False),
                      sa.Column('type', sa.Text, nullable=False),
                      sa.Column('word', sa.Text),
-                     sa.Column('info', self.conn.t.types.Json))
+                     sa.Column('info', Json))
 
 
     async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
 
 
     async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
@@ -186,11 +188,10 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                         if trange.start == 0:
                             query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
                     else:
                         if trange.start == 0:
                             query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
                     else:
-                        query.add_token(trange, qmod.TokenType.QUALIFIER, token)
-                        if trange.start == 0 or trange.end == query.num_token_slots():
-                            token = copy(token)
-                            token.penalty += 0.1 * (query.num_token_slots())
+                        if trange.start == 0 and trange.end == query.num_token_slots():
                             query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
                             query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
+                        else:
+                            query.add_token(trange, qmod.TokenType.QUALIFIER, token)
                 else:
                     query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token)
 
                 else:
                     query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token)
 
@@ -263,7 +264,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
             if len(part.token) <= 4 and part[0].isdigit()\
                and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
                 query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
             if len(part.token) <= 4 and part[0].isdigit()\
                and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
                 query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
-                                ICUToken(0.5, 0, 1, part.token, True, part.token, None))
+                                ICUToken(0.5, 0, 1, 1, part.token, True, part.token, None))
 
 
     def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
 
 
     def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None: