Merge remote-tracking branch 'upstream/master'

author Sarah Hoffmann <lonvia@denofr.de>

Wed, 13 Nov 2024 18:35:54 +0000 (19:35 +0100)

committer Sarah Hoffmann <lonvia@denofr.de>

Wed, 13 Nov 2024 18:35:54 +0000 (19:35 +0100)
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 13 Nov 2024 18:35:54 +0000 (19:35 +0100)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 13 Nov 2024 18:35:54 +0000 (19:35 +0100)
diff --combined src/nominatim_api/search/db_search_builder.py

index 0d7487a4c04fac42a0248ab97acacaf2e11c02e0,632270ef04176f394a10e29d9397141bdeb5a457..1fbb7168bb44a963f31e83bfd99f6f534bcf9be5
--- 1/src/nominatim_api/search/db_search_builder.py
--- 2/src/nominatim_api/search/db_search_builder.py
+++ b/src/nominatim_api/search/db_search_builder.py
@@@ -42,7 -42,7 +42,7 @@@ def build_poi_search(category: List[Tup
       class _PoiData(dbf.SearchData):
           penalty = 0.0
           qualifiers = dbf.WeightedCategories(category, [0.0] * len(category))
-         countries=ccs
+         countries = ccs
   
       return dbs.PoiSearch(_PoiData())
   
@@@ -55,15 -55,13 +55,13 @@@ class SearchBuilder
           self.query = query
           self.details = details
   
- 
       @property
       def configured_for_country(self) -> bool:
           """ Return true if the search details are configured to
               allow countries in the result.
           """
           return self.details.min_rank <= 4 and self.details.max_rank >= 4 \
-                and self.details.layer_enabled(DataLayer.ADDRESS)
- 
+             and self.details.layer_enabled(DataLayer.ADDRESS)
   
       @property
       def configured_for_postcode(self) -> bool:
@@@ -71,8 -69,7 +69,7 @@@
               allow postcodes in the result.
           """
           return self.details.min_rank <= 5 and self.details.max_rank >= 11\
-                and self.details.layer_enabled(DataLayer.ADDRESS)
- 
+             and self.details.layer_enabled(DataLayer.ADDRESS)
   
       @property
       def configured_for_housenumbers(self) -> bool:
@@@ -80,8 -77,7 +77,7 @@@
               allow addresses in the result.
           """
           return self.details.max_rank >= 30 \
-                and self.details.layer_enabled(DataLayer.ADDRESS)
- 
+             and self.details.layer_enabled(DataLayer.ADDRESS)
   
       def build(self, assignment: TokenAssignment) -> Iterator[dbs.AbstractSearch]:
           """ Yield all possible abstract searches for the given token assignment.
@@@ -92,7 -88,7 +88,7 @@@
   
           near_items = self.get_near_items(assignment)
           if near_items is not None and not near_items:
-             return # impossible compbination of near items and category parameter
+             return  # impossible combination of near items and category parameter
   
           if assignment.name is None:
               if near_items and not sdata.postcodes:
@@@ -123,7 -119,6 +119,6 @@@
                   search.penalty += assignment.penalty
                   yield search
   
- 
       def build_poi_search(self, sdata: dbf.SearchData) -> Iterator[dbs.AbstractSearch]:
           """ Build abstract search query for a simple category search.
               This kind of search requires an additional geographic constraint.
@@@ -132,7 -127,6 +127,6 @@@
              and ((self.details.viewbox and self.details.bounded_viewbox) or self.details.near):
               yield dbs.PoiSearch(sdata)
   
- 
       def build_special_search(self, sdata: dbf.SearchData,
                                address: List[TokenRange],
                                is_category: bool) -> Iterator[dbs.AbstractSearch]:
@@@ -157,7 -151,6 +151,6 @@@
                   penalty += 0.2
               yield dbs.PostcodeSearch(penalty, sdata)
   
- 
       def build_housenumber_search(self, sdata: dbf.SearchData, hnrs: List[Token],
                                    address: List[TokenRange]) -> Iterator[dbs.AbstractSearch]:
           """ Build a simple address search for special entries where the
@@@ -167,7 -160,7 +160,7 @@@
           expected_count = sum(t.count for t in hnrs)
   
           partials = {t.token: t.addr_count for trange in address
-                        for t in self.query.get_partials_list(trange)}
+                     for t in self.query.get_partials_list(trange)}
   
           if not partials:
               # can happen when none of the partials is indexed
@@@ -190,7 -183,6 +183,6 @@@
           sdata.housenumbers = dbf.WeightedStrings([], [])
           yield dbs.PlaceSearch(0.05, sdata, expected_count)
   
- 
       def build_name_search(self, sdata: dbf.SearchData,
                             name: TokenRange, address: List[TokenRange],
                             is_category: bool) -> Iterator[dbs.AbstractSearch]:
@@@ -205,14 -197,13 +197,13 @@@
                   sdata.lookups = lookup
                   yield dbs.PlaceSearch(penalty + name_penalty, sdata, count)
   
- 
-     def yield_lookups(self, name: TokenRange, address: List[TokenRange])\
-                           -> Iterator[Tuple[float, int, List[dbf.FieldLookup]]]:
+     def yield_lookups(self, name: TokenRange, address: List[TokenRange]
+                       ) -> Iterator[Tuple[float, int, List[dbf.FieldLookup]]]:
           """ Yield all variants how the given name and address should best
               be searched for. This takes into account how frequent the terms
               are and tries to find a lookup that optimizes index use.
           """
-         penalty = 0.0 # extra penalty
+         penalty = 0.0  # extra penalty
           name_partials = {t.token: t for t in self.query.get_partials_list(name)}
   
           addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
@@@ -224,14 -215,14 +215,14 @@@
               yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
               return
   
- -        addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 30000
+ +        addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 50000
           # Partial term to frequent. Try looking up by rare full names first.
           name_fulls = self.query.get_tokens(name, TokenType.WORD)
           if name_fulls:
               fulls_count = sum(t.count for t in name_fulls)
   
- -            if fulls_count < 50000 or addr_count < 30000:
+ +            if fulls_count < 80000 or addr_count < 50000:
-                 yield penalty,fulls_count / (2**len(addr_tokens)), \
+                 yield penalty, fulls_count / (2**len(addr_tokens)), \
                       self.get_full_name_ranking(name_fulls, addr_partials,
                                                  fulls_count > 30000 / max(1, len(addr_tokens)))
   
@@@ -241,9 -232,8 +232,8 @@@
           if exp_count < 10000 and addr_count < 20000:
               penalty += 0.35 * max(1 if name_fulls else 0.1,
                                     5 - len(name_partials) - len(addr_tokens))
-             yield penalty, exp_count,\
-                   self.get_name_address_ranking(list(name_partials.keys()), addr_partials)
- 
+             yield penalty, exp_count, \
+                 self.get_name_address_ranking(list(name_partials.keys()), addr_partials)
   
       def get_name_address_ranking(self, name_tokens: List[int],
                                    addr_partials: List[Token]) -> List[dbf.FieldLookup]:
@@@ -268,7 -258,6 +258,6 @@@
   
           return lookup
   
- 
       def get_full_name_ranking(self, name_fulls: List[Token], addr_partials: List[Token],
                                 use_lookup: bool) -> List[dbf.FieldLookup]:
           """ Create a ranking expression with full name terms and
@@@ -280,7 -269,12 +269,7 @@@
           # This might yield wrong results, nothing we can do about that.
           if use_lookup:
               addr_restrict_tokens = []
- -            addr_lookup_tokens = []
- -            for t in addr_partials:
- -                if t.addr_count > 20000:
- -                    addr_restrict_tokens.append(t.token)
- -                else:
- -                    addr_lookup_tokens.append(t.token)
+ +            addr_lookup_tokens = [t.token for t in addr_partials]
           else:
               addr_restrict_tokens = [t.token for t in addr_partials]
               addr_lookup_tokens = []
@@@ -288,7 -282,6 +277,6 @@@
           return dbf.lookup_by_any_name([t.token for t in name_fulls],
                                         addr_restrict_tokens, addr_lookup_tokens)
   
- 
       def get_name_ranking(self, trange: TokenRange,
                            db_field: str = 'name_vector') -> dbf.FieldRanking:
           """ Create a ranking expression for a name term in the given range.
@@@ -301,7 -294,6 +289,6 @@@
           default = sum(t.penalty for t in name_partials) + 0.2
           return dbf.FieldRanking(db_field, default, ranks)
   
- 
       def get_addr_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
           """ Create a list of ranking expressions for an address term
               for the given ranges.
@@@ -310,7 -302,7 +297,7 @@@
           heapq.heappush(todo, (0, trange.start, dbf.RankedTokens(0.0, [])))
           ranks: List[dbf.RankedTokens] = []
   
-         while todo: # pylint: disable=too-many-nested-blocks
+         while todo:
               neglen, pos, rank = heapq.heappop(todo)
               for tlist in self.query.nodes[pos].starting:
                   if tlist.ttype in (TokenType.PARTIAL, TokenType.WORD):
@@@ -349,7 -341,6 +336,6 @@@
   
           return dbf.FieldRanking('nameaddress_vector', default, ranks)
   
- 
       def get_search_data(self, assignment: TokenAssignment) -> Optional[dbf.SearchData]:
           """ Collect the tokens for the non-name search fields in the
               assignment.
@@@ -396,7 -387,6 +382,6 @@@
   
           return sdata
   
- 
       def get_country_tokens(self, trange: TokenRange) -> List[Token]:
           """ Return the list of country tokens for the given range,
               optionally filtered by the country list from the details
@@@ -408,7 -398,6 +393,6 @@@
   
           return tokens
   
- 
       def get_qualifier_tokens(self, trange: TokenRange) -> List[Token]:
           """ Return the list of qualifier tokens for the given range,
               optionally filtered by the qualifier list from the details
@@@ -420,7 -409,6 +404,6 @@@
   
           return tokens
   
- 
       def get_near_items(self, assignment: TokenAssignment) -> Optional[dbf.WeightedCategories]:
           """ Collect tokens for near items search or use the categories
               requested per parameter.
diff --combined src/nominatim_api/search/icu_tokenizer.py

index c2a265105a69d08eb3d7d8a75331e4a8c4d61dc9,fa14531aed0d6c07cf79c277255324495b1b063d..c18dd8be62ed1190284e9c0751464b5e54091a47
--- 1/src/nominatim_api/search/icu_tokenizer.py
--- 2/src/nominatim_api/search/icu_tokenizer.py
+++ b/src/nominatim_api/search/icu_tokenizer.py
@@@ -48,6 -48,7 +48,7 @@@ class QueryPart(NamedTuple)
   QueryParts = List[QueryPart]
   WordDict = Dict[str, List[qmod.TokenRange]]
   
+ 
   def yield_words(terms: List[QueryPart], start: int) -> Iterator[Tuple[str, qmod.TokenRange]]:
       """ Return all combinations of words in the terms list after the
           given position.
@@@ -72,7 -73,6 +73,6 @@@ class ICUToken(qmod.Token)
           assert self.info
           return self.info.get('class', ''), self.info.get('type', '')
   
- 
       def rematch(self, norm: str) -> None:
           """ Check how well the token matches the given normalized string
               and add a penalty, if necessary.
@@@ -91,7 -91,6 +91,6 @@@
                   distance += abs((ato-afrom) - (bto-bfrom))
           self.penalty += (distance/len(self.lookup_word))
   
- 
       @staticmethod
       def from_db_row(row: SaRow) -> 'ICUToken':
           """ Create a ICUToken from the row of the word table.
@@@ -128,16 -127,13 +127,13 @@@
                           addr_count=max(1, addr_count))
   
   
- 
   class ICUQueryAnalyzer(AbstractQueryAnalyzer):
       """ Converter for query strings into a tokenized query
           using the tokens created by a ICU tokenizer.
       """
- 
       def __init__(self, conn: SearchConnection) -> None:
           self.conn = conn
   
- 
       async def setup(self) -> None:
           """ Set up static data structures needed for the analysis.
           """
@@@ -163,7 -159,6 +159,6 @@@
                        sa.Column('word', sa.Text),
                        sa.Column('info', Json))
   
- 
       async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
           """ Analyze the given list of phrases and return the
               tokenized query.
@@@ -202,20 -197,13 +197,18 @@@
   
           return query
   
- 
       def normalize_text(self, text: str) -> str:
           """ Bring the given text into a normalized form. That is the
               standardized form search will work with. All information removed
               at this stage is inevitably lost.
           """
- -        return cast(str, self.normalizer.transliterate(text))
+ +        norm = cast(str, self.normalizer.transliterate(text))
+ +        numspaces = norm.count(' ')
+ +        if numspaces > 4 and len(norm) <= (numspaces + 1) * 3:
+ +            return ''
+ +
+ +        return norm
   
- 
       def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
           """ Transliterate the phrases and split them into tokens.
   
@@@ -248,7 -236,6 +241,6 @@@
   
           return parts, words
   
- 
       async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]':
           """ Return the token information from the database for the
               given word tokens.
@@@ -256,7 -243,6 +248,6 @@@
           t = self.conn.t.meta.tables['word']
           return await self.conn.execute(t.select().where(t.c.word_token.in_(words)))
   
- 
       def add_extra_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
           """ Add tokens to query that are not saved in the database.
           """
@@@ -268,7 -254,6 +259,6 @@@
                                            count=1, addr_count=1, lookup_word=part.token,
                                            word_token=part.token, info=None))
   
- 
       def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
           """ Add penalties to tokens that depend on presence of other token.
           """
@@@ -279,8 -264,8 +269,8 @@@
                          and (repl.ttype != qmod.TokenType.HOUSENUMBER
                               or len(tlist.tokens[0].lookup_word) > 4):
                           repl.add_penalty(0.39)
-             elif tlist.ttype == qmod.TokenType.HOUSENUMBER \
-                  and len(tlist.tokens[0].lookup_word) <= 3:
+             elif (tlist.ttype == qmod.TokenType.HOUSENUMBER
+                   and len(tlist.tokens[0].lookup_word) <= 3):
                   if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
                       for repl in node.starting:
                           if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER:
diff --combined src/nominatim_db/tokenizer/icu_tokenizer.py

index 452bf26ce951c949e2bda958b3fb7e1581134879,83928644a9c3a9964e26af05c81ef061b8cfeb05..19818adb9d3df610ed04ec39b547cf99d5adc590
--- 1/src/nominatim_db/tokenizer/icu_tokenizer.py
--- 2/src/nominatim_db/tokenizer/icu_tokenizer.py
+++ b/src/nominatim_db/tokenizer/icu_tokenizer.py
@@@ -17,7 -17,7 +17,7 @@@ from pathlib import Pat
   from psycopg.types.json import Jsonb
   from psycopg import sql as pysql
   
- from ..db.connection import connect, Connection, Cursor, server_version_tuple,\
+ from ..db.connection import connect, Connection, Cursor, server_version_tuple, \
                               drop_tables, table_exists, execute_scalar
   from ..config import Configuration
   from ..db.sql_preprocessor import SQLPreprocessor
@@@ -32,10 -32,11 +32,11 @@@ DBCFG_TERM_NORMALIZATION = "tokenizer_t
   
   LOG = logging.getLogger()
   
- WORD_TYPES =(('country_names', 'C'),
-              ('postcodes', 'P'),
-              ('full_word', 'W'),
-              ('housenumbers', 'H'))
+ WORD_TYPES = (('country_names', 'C'),
+               ('postcodes', 'P'),
+               ('full_word', 'W'),
+               ('housenumbers', 'H'))
+ 
   
   def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
       """ Create a new instance of the tokenizer provided by this module.
@@@ -54,7 -55,6 +55,6 @@@ class ICUTokenizer(AbstractTokenizer)
           self.data_dir = data_dir
           self.loader: Optional[ICURuleLoader] = None
   
- 
       def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
           """ Set up a new tokenizer for the database.
   
@@@ -70,7 -70,6 +70,6 @@@
               self._setup_db_tables(config)
               self._create_base_indices(config, 'word')
   
- 
       def init_from_project(self, config: Configuration) -> None:
           """ Initialise the tokenizer from the project directory.
           """
@@@ -79,14 -78,12 +78,12 @@@
           with connect(self.dsn) as conn:
               self.loader.load_config_from_db(conn)
   
- 
       def finalize_import(self, config: Configuration) -> None:
           """ Do any required postprocessing to make the tokenizer data ready
               for use.
           """
           self._create_lookup_indices(config, 'word')
   
- 
       def update_sql_functions(self, config: Configuration) -> None:
           """ Reimport the SQL functions for this tokenizer.
           """
@@@ -94,14 -91,12 +91,12 @@@
               sqlp = SQLPreprocessor(conn, config)
               sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
   
- 
       def check_database(self, config: Configuration) -> None:
           """ Check that the tokenizer is set up correctly.
           """
           # Will throw an error if there is an issue.
           self.init_from_project(config)
   
- 
       def update_statistics(self, config: Configuration, threads: int = 2) -> None:
           """ Recompute frequencies for all name words.
           """
@@@ -126,28 -121,29 +121,29 @@@
                                        SELECT unnest(nameaddress_vector) as id, count(*)
                                        FROM search_name GROUP BY id""")
                       cur.execute('CREATE INDEX ON addressword_frequencies(id)')
-                     cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
-                                                                                INOUT info JSONB)
-                                    AS $$
-                                    DECLARE rec RECORD;
-                                    BEGIN
-                                    IF info is null THEN
-                                      info = '{}'::jsonb;
-                                    END IF;
-                                    FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
-                                    LOOP
-                                      info = info || jsonb_build_object('count', rec.count);
-                                    END LOOP;
-                                    FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
-                                    LOOP
-                                      info = info || jsonb_build_object('addr_count', rec.count);
-                                    END LOOP;
-                                    IF info = '{}'::jsonb THEN
-                                      info = null;
-                                    END IF;
-                                    END;
-                                    $$ LANGUAGE plpgsql IMMUTABLE;
-                                 """)
+                     cur.execute("""
+                         CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
+                                                                     INOUT info JSONB)
+                         AS $$
+                         DECLARE rec RECORD;
+                         BEGIN
+                         IF info is null THEN
+                           info = '{}'::jsonb;
+                         END IF;
+                         FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
+                         LOOP
+                           info = info || jsonb_build_object('count', rec.count);
+                         END LOOP;
+                         FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
+                         LOOP
+                           info = info || jsonb_build_object('addr_count', rec.count);
+                         END LOOP;
+                         IF info = '{}'::jsonb THEN
+                           info = null;
+                         END IF;
+                         END;
+                         $$ LANGUAGE plpgsql IMMUTABLE;
+                         """)
                       LOG.info('Update word table with recomputed frequencies')
                       drop_tables(conn, 'tmp_word')
                       cur.execute("""CREATE TABLE tmp_word AS
@@@ -186,7 -182,6 +182,7 @@@
                                               END) as info
                                       FROM word LEFT JOIN word_frequencies wf
                                            ON word.word_id = wf.id
+ +                                    ORDER BY word_id
                                   """)
                       drop_tables(conn, 'word_frequencies')
   
@@@ -201,8 -196,6 +197,6 @@@
           self._create_lookup_indices(config, 'tmp_word')
           self._move_temporary_word_table('tmp_word')
   
- 
- 
       def _cleanup_housenumbers(self) -> None:
           """ Remove unused house numbers.
           """
@@@ -236,8 -229,6 +230,6 @@@
                                   (list(candidates.values()), ))
                   conn.commit()
   
- 
- 
       def update_word_tokens(self) -> None:
           """ Remove unused tokens.
           """
@@@ -245,7 -236,6 +237,6 @@@
           self._cleanup_housenumbers()
           LOG.warning("Tokenizer house-keeping done.")
   
- 
       def name_analyzer(self) -> 'ICUNameAnalyzer':
           """ Create a new analyzer for tokenizing names and queries
               using this tokinzer. Analyzers are context managers and should
@@@ -265,7 -255,6 +256,6 @@@
           return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                  self.loader.make_token_analysis())
   
- 
       def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
           """ Return a list of the `num` most frequent full words
               in the database.
@@@ -277,7 -266,6 +267,6 @@@
                                ORDER BY count DESC LIMIT %s""", (num,))
               return list(s[0].split('@')[0] for s in cur)
   
- 
       def _save_config(self) -> None:
           """ Save the configuration that needs to remain stable for the given
               database as database properties.
@@@ -286,7 -274,6 +275,6 @@@
           with connect(self.dsn) as conn:
               self.loader.save_config_to_db(conn)
   
- 
       def _setup_db_tables(self, config: Configuration) -> None:
           """ Set up the word table and fill it with pre-computed word
               frequencies.
@@@ -310,7 -297,6 +298,6 @@@
               """)
               conn.commit()
   
- 
       def _create_base_indices(self, config: Configuration, table_name: str) -> None:
           """ Set up the word table and fill it with pre-computed word
               frequencies.
@@@ -331,21 -317,21 +318,21 @@@
                                   column_type=ctype)
               conn.commit()
   
- 
       def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
           """ Create additional indexes used when running the API.
           """
           with connect(self.dsn) as conn:
               sqlp = SQLPreprocessor(conn, config)
               # Index required for details lookup.
-             sqlp.run_string(conn, """
+             sqlp.run_string(
+                 conn,
+                 """
                   CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
                     ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
-             """,
-             table_name=table_name)
+                 """,
+                 table_name=table_name)
               conn.commit()
   
- 
       def _move_temporary_word_table(self, old: str) -> None:
           """ Rename all tables and indexes used by the tokenizer.
           """
@@@ -362,8 -348,6 +349,6 @@@
               conn.commit()
   
   
- 
- 
   class ICUNameAnalyzer(AbstractAnalyzer):
       """ The ICU analyzer uses the ICU library for splitting names.
   
@@@ -380,7 -364,6 +365,6 @@@
   
           self._cache = _TokenCache()
   
- 
       def close(self) -> None:
           """ Free all resources used by the analyzer.
           """
@@@ -388,20 -371,17 +372,17 @@@
               self.conn.close()
               self.conn = None
   
- 
       def _search_normalized(self, name: str) -> str:
           """ Return the search token transliteration of the given name.
           """
           return cast(str, self.token_analysis.search.transliterate(name)).strip()
   
- 
       def _normalized(self, name: str) -> str:
           """ Return the normalized version of the given name with all
               non-relevant information removed.
           """
           return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
   
- 
       def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
           """ Return token information for the given list of words.
               If a word starts with # it is assumed to be a full name
@@@ -433,8 -413,7 +414,7 @@@
               part_ids = {r[0]: r[1] for r in cur}
   
           return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
-                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
- 
+             + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
   
       def normalize_postcode(self, postcode: str) -> str:
           """ Convert the postcode to a standardized form.
@@@ -444,7 -423,6 +424,6 @@@
           """
           return postcode.strip().upper()
   
- 
       def update_postcodes_from_db(self) -> None:
           """ Update postcode tokens in the word table from the location_postcode
               table.
@@@ -517,9 -495,6 +496,6 @@@
               with self.conn.cursor() as cur:
                   cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)
   
- 
- 
- 
       def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                                  should_replace: bool) -> None:
           """ Replace the search index for special phrases with the new phrases.
@@@ -549,7 -524,6 +525,6 @@@
           LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                    len(norm_phrases), added, deleted)
   
- 
       def _add_special_phrases(self, cursor: Cursor,
                                new_phrases: Set[Tuple[str, str, str, str]],
                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
@@@ -569,10 -543,9 +544,9 @@@
   
           return added
   
- 
       def _remove_special_phrases(self, cursor: Cursor,
-                              new_phrases: Set[Tuple[str, str, str, str]],
-                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
+                                 new_phrases: Set[Tuple[str, str, str, str]],
+                                 existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
           """ Remove all phrases from the database that are no longer in the
               new phrase list.
           """
@@@ -588,7 -561,6 +562,6 @@@
   
           return len(to_delete)
   
- 
       def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
           """ Add default names for the given country to the search index.
           """
@@@ -600,7 -572,6 +573,6 @@@
                                        self.sanitizer.process_names(info)[0],
                                        internal=True)
   
- 
       def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                   internal: bool = False) -> None:
           """ Add names for the given country from an already sanitized
@@@ -652,7 -623,6 +624,6 @@@
                             """
                   cur.execute(sql, (country_code, list(new_tokens)))
   
- 
       def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
           """ Determine tokenizer information about the given place.
   
@@@ -675,7 -645,6 +646,6 @@@
   
           return token_info.to_dict()
   
- 
       def _process_place_address(self, token_info: '_TokenInfo',
                                  address: Sequence[PlaceName]) -> None:
           for item in address:
@@@ -688,12 -657,11 +658,11 @@@
               elif item.kind == 'place':
                   if not item.suffix:
                       token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
-             elif not item.kind.startswith('_') and not item.suffix and \
-                  item.kind not in ('country', 'full', 'inclusion'):
+             elif (not item.kind.startswith('_') and not item.suffix and
+                   item.kind not in ('country', 'full', 'inclusion')):
                   token_info.add_address_term(item.kind,
                                               itertools.chain(*self._compute_name_tokens([item])))
   
- 
       def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
           """ Normalize the housenumber and return the word token and the
               canonical form.
@@@ -729,7 -697,6 +698,6 @@@
   
           return result
   
- 
       def _retrieve_full_tokens(self, name: str) -> List[int]:
           """ Get the full name token for the given name, if it exists.
               The name is only retrieved for the standard analyser.
@@@ -750,7 -717,6 +718,6 @@@
   
           return full
   
- 
       def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
           """ Computes the full name and partial name tokens for the given
               dictionary of names.
@@@ -788,7 -754,6 +755,6 @@@
   
           return full_tokens, partial_tokens
   
- 
       def _add_postcode(self, item: PlaceName) -> Optional[str]:
           """ Make sure the normalized postcode is present in the word table.
           """
@@@ -836,11 -801,9 +802,9 @@@ class _TokenInfo
           self.address_tokens: Dict[str, str] = {}
           self.postcode: Optional[str] = None
   
- 
       def _mk_array(self, tokens: Iterable[Any]) -> str:
           return f"{{{','.join((str(s) for s in tokens))}}}"
   
- 
       def to_dict(self) -> Dict[str, Any]:
           """ Return the token information in database importable format.
           """
@@@ -867,13 -830,11 +831,11 @@@
   
           return out
   
- 
       def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
           """ Adds token information for the normalised names.
           """
           self.names = self._mk_array(itertools.chain(fulls, partials))
   
- 
       def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
           """ Extract housenumber information from a list of normalised
               housenumbers.
@@@ -883,7 -844,6 +845,6 @@@
               self.housenumbers.add(hnr)
               self.housenumber_tokens.add(token)
   
- 
       def add_street(self, tokens: Iterable[int]) -> None:
           """ Add addr:street match terms.
           """
@@@ -891,13 -851,11 +852,11 @@@
               self.street_tokens = set()
           self.street_tokens.update(tokens)
   
- 
       def add_place(self, tokens: Iterable[int]) -> None:
           """ Add addr:place search and match terms.
           """
           self.place_tokens.update(tokens)
   
- 
       def add_address_term(self, key: str, partials: Iterable[int]) -> None:
           """ Add additional address terms.
           """
author	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 13 Nov 2024 18:35:54 +0000 (19:35 +0100)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 13 Nov 2024 18:35:54 +0000 (19:35 +0100)
		1	2
src/nominatim_api/search/db_search_builder.py	patch \|	diff1 \|	diff2 \|	blob \| history
src/nominatim_api/search/icu_tokenizer.py	patch \|	diff1 \|	diff2 \|	blob \| history
src/nominatim_db/tokenizer/icu_tokenizer.py	patch \|	diff1 \|	diff2 \|	blob \| history