From 231250f2eb272b77d54e4b4b18bd85a80413ac34 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 29 Sep 2021 10:37:54 +0200 Subject: [PATCH] add wrapper class for place data passed to tokenizer This is mostly for convenience and documentation purposes. --- nominatim/indexer/place_info.py | 44 +++++++++++++++++++++++++ nominatim/indexer/runners.py | 8 +++-- nominatim/tokenizer/base.py | 12 ++----- nominatim/tokenizer/icu_tokenizer.py | 6 ++-- nominatim/tokenizer/legacy_tokenizer.py | 6 ++-- nominatim/tools/tiger_data.py | 5 ++- test/python/dummy_tokenizer.py | 2 ++ test/python/test_tokenizer_icu.py | 20 +++++++---- test/python/test_tokenizer_legacy.py | 15 +++++---- 9 files changed, 84 insertions(+), 34 deletions(-) create mode 100644 nominatim/indexer/place_info.py diff --git a/nominatim/indexer/place_info.py b/nominatim/indexer/place_info.py new file mode 100644 index 00000000..fd179fef --- /dev/null +++ b/nominatim/indexer/place_info.py @@ -0,0 +1,44 @@ +""" +Wrapper around place information the indexer gets from the database and hands to +the tokenizer. +""" + +import psycopg2.extras + +class PlaceInfo: + """ Data class containing all information the tokenizer gets about a + place it should process the names for. + """ + + def __init__(self, info): + self._info = info + + + def analyze(self, analyzer): + """ Process this place with the given tokenizer and return the + result in psycopg2-compatible Json. + """ + return psycopg2.extras.Json(analyzer.process_place(self)) + + + @property + def name(self): + """ A dictionary with the names of the place or None if the place + has no names. + """ + return self._info.get('name') + + + @property + def address(self): + """ A dictionary with the address elements of the place + or None if no address information is available. + """ + return self._info.get('address') + + + @property + def country_feature(self): + """ Return the country code if the place is a valid country boundary. 
+ """ + return self._info.get('country_feature') diff --git a/nominatim/indexer/runners.py b/nominatim/indexer/runners.py index 29261ee5..43966419 100644 --- a/nominatim/indexer/runners.py +++ b/nominatim/indexer/runners.py @@ -4,14 +4,16 @@ tasks. """ import functools -import psycopg2.extras from psycopg2 import sql as pysql +from nominatim.indexer.place_info import PlaceInfo + # pylint: disable=C0111 def _mk_valuelist(template, num): return pysql.SQL(',').join([pysql.SQL(template)] * num) + class AbstractPlacexRunner: """ Returns SQL commands for indexing of the placex table. """ @@ -47,7 +49,7 @@ class AbstractPlacexRunner: for place in places: for field in ('place_id', 'name', 'address', 'linked_place_id'): values.append(place[field]) - values.append(psycopg2.extras.Json(self.analyzer.process_place(place))) + values.append(PlaceInfo(place).analyze(self.analyzer)) worker.perform(self._index_sql(len(places)), values) @@ -141,7 +143,7 @@ class InterpolationRunner: values = [] for place in places: values.extend((place[x] for x in ('place_id', 'address'))) - values.append(psycopg2.extras.Json(self.analyzer.process_place(place))) + values.append(PlaceInfo(place).analyze(self.analyzer)) worker.perform(self._index_sql(len(places)), values) diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py index 00ecae44..d827f813 100644 --- a/nominatim/tokenizer/base.py +++ b/nominatim/tokenizer/base.py @@ -6,6 +6,7 @@ from abc import ABC, abstractmethod from typing import List, Tuple, Dict, Any from nominatim.config import Configuration +from nominatim.indexer.place_info import PlaceInfo # pylint: disable=unnecessary-pass @@ -105,20 +106,13 @@ class AbstractAnalyzer(ABC): @abstractmethod - def process_place(self, place: Dict) -> Any: + def process_place(self, place: PlaceInfo) -> Any: """ Extract tokens for the given place and compute the information to be handed to the PL/pgSQL processor for building the search index. 
Arguments: - place: Dictionary with the information about the place. Currently - the following fields may be present: - - - *name* is a dictionary of names for the place together - with the designation of the name. - - *address* is a dictionary of address terms. - - *country_feature* is set to a country code when the - place describes a country. + place: Place information retrieved from the database. Returns: A JSON-serialisable structure that will be handed into diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 5768fd35..81b07568 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -390,18 +390,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): """ token_info = _TokenInfo(self._cache) - names = place.get('name') + names = place.name if names: fulls, partials = self._compute_name_tokens(names) token_info.add_names(fulls, partials) - country_feature = place.get('country_feature') + country_feature = place.country_feature if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): self.add_country_names(country_feature.lower(), names) - address = place.get('address') + address = place.address if address: self._process_place_address(token_info, address) diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index 8957426b..8bfb309d 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -405,16 +405,16 @@ class LegacyNameAnalyzer(AbstractAnalyzer): """ token_info = _TokenInfo(self._cache) - names = place.get('name') + names = place.name if names: token_info.add_names(self.conn, names) - country_feature = place.get('country_feature') + country_feature = place.country_feature if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): self.add_country_names(country_feature.lower(), names) - address = place.get('address') + address = place.address if address: 
self._process_place_address(token_info, address) diff --git a/nominatim/tools/tiger_data.py b/nominatim/tools/tiger_data.py index ff498f77..19a12682 100644 --- a/nominatim/tools/tiger_data.py +++ b/nominatim/tools/tiger_data.py @@ -7,12 +7,11 @@ import logging import os import tarfile -import psycopg2.extras - from nominatim.db.connection import connect from nominatim.db.async_connection import WorkerPool from nominatim.db.sql_preprocessor import SQLPreprocessor from nominatim.errors import UsageError +from nominatim.indexer.place_info import PlaceInfo LOG = logging.getLogger() @@ -58,7 +57,7 @@ def handle_threaded_sql_statements(pool, fd, analyzer): address = dict(street=row['street'], postcode=row['postcode']) args = ('SRID=4326;' + row['geometry'], int(row['from']), int(row['to']), row['interpolation'], - psycopg2.extras.Json(analyzer.process_place(dict(address=address))), + PlaceInfo({'address': address}).analyze(analyzer), analyzer.normalize_postcode(row['postcode'])) except ValueError: continue diff --git a/test/python/dummy_tokenizer.py b/test/python/dummy_tokenizer.py index 69202bc3..db0f32cd 100644 --- a/test/python/dummy_tokenizer.py +++ b/test/python/dummy_tokenizer.py @@ -1,6 +1,7 @@ """ Tokenizer for testing. """ +from nominatim.indexer.place_info import PlaceInfo def create(dsn, data_dir): """ Create a new instance of the tokenizer provided by this module. 
@@ -68,4 +69,5 @@ class DummyNameAnalyzer: @staticmethod def process_place(place): + assert isinstance(place, PlaceInfo) return {} diff --git a/test/python/test_tokenizer_icu.py b/test/python/test_tokenizer_icu.py index ed079269..28c6ef7a 100644 --- a/test/python/test_tokenizer_icu.py +++ b/test/python/test_tokenizer_icu.py @@ -11,6 +11,7 @@ from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules from nominatim.tokenizer.icu_rule_loader import ICURuleLoader from nominatim.db import properties from nominatim.db.sql_preprocessor import SQLPreprocessor +from nominatim.indexer.place_info import PlaceInfo from mock_icu_word_table import MockIcuWordTable @@ -322,30 +323,37 @@ class TestPlaceNames: assert eval(info['names']) == set((t[2] for t in tokens)) + def process_named_place(self, names, country_feature=None): + place = {'name': names} + if country_feature: + place['country_feature'] = country_feature + + return self.analyzer.process_place(PlaceInfo(place)) + + def test_simple_names(self): - info = self.analyzer.process_place({'name': {'name': 'Soft bAr', 'ref': '34'}}) + info = self.process_named_place({'name': 'Soft bAr', 'ref': '34'}) self.expect_name_terms(info, '#Soft bAr', '#34', 'Soft', 'bAr', '34') @pytest.mark.parametrize('sep', [',' , ';']) def test_names_with_separator(self, sep): - info = self.analyzer.process_place({'name': {'name': sep.join(('New York', 'Big Apple'))}}) + info = self.process_named_place({'name': sep.join(('New York', 'Big Apple'))}) self.expect_name_terms(info, '#New York', '#Big Apple', 'new', 'york', 'big', 'apple') def test_full_names_with_bracket(self): - info = self.analyzer.process_place({'name': {'name': 'Houseboat (left)'}}) + info = self.process_named_place({'name': 'Houseboat (left)'}) self.expect_name_terms(info, '#Houseboat (left)', '#Houseboat', 'houseboat', 'left') def test_country_name(self, word_table): - info = self.analyzer.process_place({'name': {'name': 'Norge'}, - 'country_feature': 'no'}) + info = 
self.process_named_place({'name': 'Norge'}, country_feature='no') self.expect_name_terms(info, '#norge', 'norge') assert word_table.get_country() == {('no', 'NORGE')} @@ -361,7 +369,7 @@ class TestPlaceAddress: def process_address(self, **kwargs): - return self.analyzer.process_place({'address': kwargs}) + return self.analyzer.process_place(PlaceInfo({'address': kwargs})) def name_token_set(self, *expected_terms): diff --git a/test/python/test_tokenizer_legacy.py b/test/python/test_tokenizer_legacy.py index 4dd3a141..2545c2db 100644 --- a/test/python/test_tokenizer_legacy.py +++ b/test/python/test_tokenizer_legacy.py @@ -5,6 +5,7 @@ import shutil import pytest +from nominatim.indexer.place_info import PlaceInfo from nominatim.tokenizer import legacy_tokenizer from nominatim.db import properties from nominatim.errors import UsageError @@ -284,21 +285,21 @@ def test_add_more_country_names(analyzer, word_table, make_standard_name): def test_process_place_names(analyzer, make_keywords): - info = analyzer.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}}) + info = analyzer.process_place(PlaceInfo({'name' : {'name' : 'Soft bAr', 'ref': '34'}})) assert info['names'] == '{1,2,3}' @pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345']) def test_process_place_postcode(analyzer, create_postcode_id, word_table, pcode): - analyzer.process_place({'address': {'postcode' : pcode}}) + analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}})) assert word_table.get_postcodes() == {pcode, } @pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836']) def test_process_place_bad_postcode(analyzer, create_postcode_id, word_table, pcode): - analyzer.process_place({'address': {'postcode' : pcode}}) + analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}})) assert not word_table.get_postcodes() @@ -319,7 +320,7 @@ class TestHousenumberName: @staticmethod @pytest.mark.parametrize('hnr', ['123a', '1', '101']) def 
test_process_place_housenumbers_simple(analyzer, hnr): - info = analyzer.process_place({'address': {'housenumber' : hnr}}) + info = analyzer.process_place(PlaceInfo({'address': {'housenumber' : hnr}})) assert info['hnr'] == hnr assert info['hnr_tokens'].startswith("{") @@ -327,15 +328,15 @@ class TestHousenumberName: @staticmethod def test_process_place_housenumbers_lists(analyzer): - info = analyzer.process_place({'address': {'conscriptionnumber' : '1; 2;3'}}) + info = analyzer.process_place(PlaceInfo({'address': {'conscriptionnumber' : '1; 2;3'}})) assert set(info['hnr'].split(';')) == set(('1', '2', '3')) @staticmethod def test_process_place_housenumbers_duplicates(analyzer): - info = analyzer.process_place({'address': {'housenumber' : '134', + info = analyzer.process_place(PlaceInfo({'address': {'housenumber' : '134', 'conscriptionnumber' : '134', - 'streetnumber' : '99a'}}) + 'streetnumber' : '99a'}})) assert set(info['hnr'].split(';')) == set(('134', '99a')) -- 2.39.5