Merge remote-tracking branch 'upstream/master'

[nominatim.git] / nominatim / api / search / icu_tokenizer.py
diff --git a/nominatim/api/search/icu_tokenizer.py b/nominatim/api/search/icu_tokenizer.py

index b68e8d10eef70816f6cb772da2d7036e8a31693d..76a1a2e5d362688d5388044b511a9d3f0ae4a13c 100644 (file)
--- a/nominatim/api/search/icu_tokenizer.py
+++ b/nominatim/api/search/icu_tokenizer.py
@@ -8,7 +8,6 @@
  Implementation of query analysis for the ICU tokenizer.
  """
  from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast
-from copy import copy
  from collections import defaultdict
  import dataclasses
  import difflib
@@ -22,6 +21,7 @@ from nominatim.api.connection import SearchConnection
  from nominatim.api.logging import log
  from nominatim.api.search import query as qmod
  from nominatim.api.search.query_analyzer_factory import AbstractQueryAnalyzer
+from nominatim.db.sqlalchemy_types import Json
  
  
  DB_TO_TOKEN_TYPE = {
@@ -101,10 +101,16 @@ class ICUToken(qmod.Token):
          penalty = 0.0
          if row.type == 'w':
              penalty = 0.3
+        elif row.type == 'W':
+            if len(row.word_token) == 1 and row.word_token == row.word:
+                penalty = 0.2 if row.word.isdigit() else 0.3
          elif row.type == 'H':
              penalty = sum(0.1 for c in row.word_token if c != ' ' and not c.isdigit())
              if all(not c.isdigit() for c in row.word_token):
                  penalty += 0.2 * (len(row.word_token) - 1)
+        elif row.type == 'C':
+            if len(row.word_token) == 1:
+                penalty = 0.3
  
          if row.info is None:
              lookup_word = row.word
@@ -153,7 +159,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                       sa.Column('word_token', sa.Text, nullable=False),
                       sa.Column('type', sa.Text, nullable=False),
                       sa.Column('word', sa.Text),
-                     sa.Column('info', self.conn.t.types.Json))
+                     sa.Column('info', Json))
  
  
      async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
@@ -178,13 +184,12 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                  if row.type == 'S':
                      if row.info['op'] in ('in', 'near'):
                          if trange.start == 0:
-                            query.add_token(trange, qmod.TokenType.CATEGORY, token)
+                            query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
                      else:
-                        query.add_token(trange, qmod.TokenType.QUALIFIER, token)
-                        if trange.start == 0 or trange.end == query.num_token_slots():
-                            token = copy(token)
-                            token.penalty += 0.1 * (query.num_token_slots())
-                            query.add_token(trange, qmod.TokenType.CATEGORY, token)
+                        if trange.start == 0 and trange.end == query.num_token_slots():
+                            query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
+                        else:
+                            query.add_token(trange, qmod.TokenType.QUALIFIER, token)
                  else:
                      query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token)
  
@@ -201,7 +206,12 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
              standardized form search will work with. All information removed
              at this stage is inevitably lost.
          """
-        return cast(str, self.normalizer.transliterate(text))
+        norm = cast(str, self.normalizer.transliterate(text))
+        numspaces = norm.count(' ')
+        if numspaces > 4 and len(norm) <= (numspaces + 1) * 3:
+            return ''
+
+        return norm
  
  
      def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]: