From: Sarah Hoffmann Date: Tue, 28 Sep 2021 07:45:15 +0000 (+0200) Subject: Merge pull request #2454 from lonvia/sort-out-token-assignment-in-sql X-Git-Tag: v4.0.0~25 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/40f9d52ad8b48e0dc6b1ac89305c1e4ef1c43884?hp=779ea8ac624187850752544eae0209f811bc1551 Merge pull request #2454 from lonvia/sort-out-token-assignment-in-sql ICU tokenizer: switch match method to using partial terms --- diff --git a/docs/develop/Tokenizers.md b/docs/develop/Tokenizers.md index 529315e4..5282db1a 100644 --- a/docs/develop/Tokenizers.md +++ b/docs/develop/Tokenizers.md @@ -190,22 +190,21 @@ be listed with a semicolon as delimiter. Must be NULL when the place has no house numbers. ```sql -FUNCTION token_addr_street_match_tokens(info JSONB) RETURNS INTEGER[] +FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[]) RETURNS BOOLEAN ``` -Return the match token IDs by which to search a matching street from the -`addr:street` tag. These IDs will be matched against the IDs supplied by -`token_get_name_match_tokens`. Must be NULL when the place has no `addr:street` -tag. +Check if the given tokens (previously saved from `token_get_name_match_tokens()`) +match against the `addr:street` tag name. Must return either NULL or FALSE +when the place has no `addr:street` tag. ```sql -FUNCTION token_addr_place_match_tokens(info JSONB) RETURNS INTEGER[] +FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[]) RETURNS BOOLEAN ``` -Return the match token IDs by which to search a matching place from the -`addr:place` tag. These IDs will be matched against the IDs supplied by -`token_get_name_match_tokens`. Must be NULL when the place has no `addr:place` -tag. +Check if the given tokens (previously saved from `token_get_name_match_tokens()`) +match against the `addr:place` tag name. Must return either NULL or FALSE +when the place has no `addr:place` tag. + ```sql FUNCTION token_addr_place_search_tokens(info JSONB) RETURNS INTEGER[] @@ -216,26 +215,34 @@ are used for searches by address when no matching place can be found in the database. Must be NULL when the place has no `addr:place` tag. ```sql -CREATE TYPE token_addresstoken AS ( - key TEXT, - match_tokens INT[], - search_tokens INT[] -); +FUNCTION token_get_address_keys(info JSONB) RETURNS SETOF TEXT +``` + +Return the set of keys for which address information is provided. This +should correspond to the list of (relevant) `addr:*` tags with the `addr:` +prefix removed or the keys used in the `address` dictionary of the place info. -FUNCTION token_get_address_tokens(info JSONB) RETURNS SETOF token_addresstoken +```sql +FUNCTION token_get_address_search_tokens(info JSONB, key TEXT) RETURNS INTEGER[] ``` -Return the match and search token IDs for explicit `addr:*` tags for the place -other than `addr:street` and `addr:place`. For each address item there are -three pieces of information returned: - - * _key_ contains the type of address item (city, county, etc.). This is the - key handed in with the `address` dictionary. - * *match_tokens* is the list of token IDs used to find the corresponding - place object for the address part. The list is matched against the IDs - from `token_get_name_match_tokens`. - * *search_tokens* is the list of token IDs under which to search the address - item. It is used when no corresponding place object was found. +Return the array of search tokens for the given address part. `key` can be +expected to be one of those returned with `token_get_address_keys()`. The +search tokens are added to the address search vector of the place, when no +corresponding OSM object could be found for the given address part from which +to copy the name information. + +```sql +FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[]) +``` + +Check if the given tokens match against the address part `key`. + +__Warning:__ the tokens that are handed in are the lists previously saved +from `token_get_name_search_tokens()`, _not_ from the match token list. This +is an historical oddity which will be fixed at some point in the future. +Currently, tokenizers are encouraged to make sure that matching works against +both the search token list and the match token list. ```sql FUNCTION token_normalized_postcode(postcode TEXT) RETURNS TEXT diff --git a/lib-sql/functions/interpolation.sql b/lib-sql/functions/interpolation.sql index 55e44dfd..4ef36f4f 100644 --- a/lib-sql/functions/interpolation.sql +++ b/lib-sql/functions/interpolation.sql @@ -43,7 +43,7 @@ LANGUAGE plpgsql STABLE; -- find the parent road of the cut road parts -CREATE OR REPLACE FUNCTION get_interpolation_parent(street INTEGER[], place INTEGER[], +CREATE OR REPLACE FUNCTION get_interpolation_parent(token_info JSONB, partition SMALLINT, centroid GEOMETRY, geom GEOMETRY) RETURNS BIGINT @@ -52,7 +52,7 @@ DECLARE parent_place_id BIGINT; location RECORD; BEGIN - parent_place_id := find_parent_for_address(street, place, partition, centroid); + parent_place_id := find_parent_for_address(token_info, partition, centroid); IF parent_place_id is null THEN FOR location IN SELECT place_id FROM placex @@ -155,9 +155,8 @@ BEGIN NEW.interpolationtype = NEW.address->'interpolation'; place_centroid := ST_PointOnSurface(NEW.linegeo); - NEW.parent_place_id = get_interpolation_parent(token_addr_street_match_tokens(NEW.token_info), - token_addr_place_match_tokens(NEW.token_info), - NEW.partition, place_centroid, NEW.linegeo); + NEW.parent_place_id = get_interpolation_parent(NEW.token_info, NEW.partition, + place_centroid, NEW.linegeo); interpol_postcode := token_normalized_postcode(NEW.address->'postcode'); diff --git a/lib-sql/functions/partition-functions.sql b/lib-sql/functions/partition-functions.sql index 53aba22c..97afec15 100644 --- a/lib-sql/functions/partition-functions.sql +++ b/lib-sql/functions/partition-functions.sql @@ -66,7 +66,7 @@ LANGUAGE plpgsql STABLE; CREATE OR REPLACE FUNCTION get_address_place(in_partition SMALLINT, feature GEOMETRY, from_rank SMALLINT, to_rank SMALLINT, - extent FLOAT, tokens INT[]) + extent FLOAT, token_info JSONB, key TEXT) RETURNS nearfeaturecentr AS $$ DECLARE @@ -80,7 +80,7 @@ BEGIN FROM location_area_large_{{ partition }} WHERE geometry && ST_Expand(feature, extent) AND rank_address between from_rank and to_rank - AND tokens && keywords + AND token_matches_address(token_info, key, keywords) GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid ORDER BY bool_or(ST_Intersects(geometry, feature)), distance LIMIT 1; RETURN r; @@ -148,18 +148,21 @@ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION getNearestNamedRoadPlaceId(in_partition INTEGER, point GEOMETRY, - isin_token INTEGER[]) + token_info JSONB) RETURNS BIGINT AS $$ DECLARE parent BIGINT; BEGIN + IF not token_has_addr_street(token_info) THEN + RETURN NULL; + END IF; {% for partition in db.partitions %} IF in_partition = {{ partition }} THEN SELECT place_id FROM search_name_{{ partition }} INTO parent - WHERE name_vector && isin_token + WHERE token_matches_street(token_info, name_vector) AND centroid && ST_Expand(point, 0.015) AND address_rank between 26 and 27 ORDER BY ST_Distance(centroid, point) ASC limit 1; @@ -174,19 +177,22 @@ LANGUAGE plpgsql STABLE; CREATE OR REPLACE FUNCTION getNearestNamedPlacePlaceId(in_partition INTEGER, point GEOMETRY, - isin_token INTEGER[]) + token_info JSONB) RETURNS BIGINT AS $$ DECLARE parent BIGINT; BEGIN + IF not token_has_addr_place(token_info) THEN + RETURN NULL; + END IF; {% for partition in db.partitions %} IF in_partition = {{ partition }} THEN SELECT place_id INTO parent FROM search_name_{{ partition }} - WHERE name_vector && isin_token + WHERE token_matches_place(token_info, name_vector) AND centroid && ST_Expand(point, 0.04) AND address_rank between 16 and 25 ORDER BY ST_Distance(centroid, point) ASC limit 1; diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql index fa7156ec..9c2a67a1 100644 --- a/lib-sql/functions/placex_triggers.sql +++ b/lib-sql/functions/placex_triggers.sql @@ -104,8 +104,7 @@ CREATE OR REPLACE FUNCTION find_parent_for_poi(poi_osm_type CHAR(1), poi_osm_id BIGINT, poi_partition SMALLINT, bbox GEOMETRY, - addr_street INTEGER[], - addr_place INTEGER[], + token_info JSONB, is_place_addr BOOLEAN) RETURNS BIGINT AS $$ @@ -119,8 +118,7 @@ BEGIN parent_place_id := find_associated_street(poi_osm_type, poi_osm_id); IF parent_place_id is null THEN - parent_place_id := find_parent_for_address(addr_street, addr_place, - poi_partition, bbox); + parent_place_id := find_parent_for_address(token_info, poi_partition, bbox); END IF; IF parent_place_id is null and poi_osm_type = 'N' THEN @@ -333,13 +331,14 @@ BEGIN WHERE s.place_id = parent_place_id; FOR addr_item IN - SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens - FROM token_get_address_tokens(token_info) - WHERE not search_tokens <@ parent_address_vector + SELECT (get_addr_tag_rank(key, country)).*, key, + token_get_address_search_tokens(token_info, key) as search_tokens + FROM token_get_address_keys(token_info) as key + WHERE not token_get_address_search_tokens(token_info, key) <@ parent_address_vector LOOP addr_place := get_address_place(in_partition, geometry, addr_item.from_rank, addr_item.to_rank, - addr_item.extent, addr_item.match_tokens); + addr_item.extent, token_info, addr_item.key); IF addr_place is null THEN -- No place found in OSM that matches. Make it at least searchable. @@ -447,14 +446,16 @@ BEGIN FOR location IN SELECT (get_address_place(partition, geometry, from_rank, to_rank, - extent, match_tokens)).*, search_tokens - FROM (SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens - FROM token_get_address_tokens(token_info)) x + extent, token_info, key)).*, key + FROM (SELECT (get_addr_tag_rank(key, country)).*, key + FROM token_get_address_keys(token_info) as key) x ORDER BY rank_address, distance, isguess desc LOOP IF location.place_id is null THEN {% if not db.reverse_only %} - nameaddress_vector := array_merge(nameaddress_vector, location.search_tokens); + nameaddress_vector := array_merge(nameaddress_vector, + token_get_address_search_tokens(token_info, + location.key)); {% endif %} ELSE {% if not db.reverse_only %} @@ -689,9 +690,6 @@ DECLARE parent_address_level SMALLINT; place_address_level SMALLINT; - addr_street INTEGER[]; - addr_place INTEGER[]; - max_rank SMALLINT; name_vector INTEGER[]; @@ -860,8 +858,6 @@ BEGIN END IF; NEW.housenumber := token_normalized_housenumber(NEW.token_info); - addr_street := token_addr_street_match_tokens(NEW.token_info); - addr_place := token_addr_place_match_tokens(NEW.token_info); NEW.postcode := null; @@ -907,7 +903,7 @@ BEGIN NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id, NEW.partition, ST_Envelope(NEW.geometry), - addr_street, addr_place, + NEW.token_info, is_place_address); -- If we found the road take a shortcut here. diff --git a/lib-sql/functions/utils.sql b/lib-sql/functions/utils.sql index c308d025..f7d2093c 100644 --- a/lib-sql/functions/utils.sql +++ b/lib-sql/functions/utils.sql @@ -215,13 +215,12 @@ LANGUAGE plpgsql STABLE; -- Find the parent of an address with addr:street/addr:place tag. -- --- \param street Value of addr:street or NULL if tag is missing. --- \param place Value of addr:place or NULL if tag is missing. +-- \param token_info Naming info with the address information. -- \param partition Partition where to search the parent. -- \param centroid Location of the address. -- -- \return Place ID of the parent if one was found, NULL otherwise. -CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEGER[], +CREATE OR REPLACE FUNCTION find_parent_for_address(token_info JSONB, partition SMALLINT, centroid GEOMETRY) RETURNS BIGINT @@ -229,30 +228,22 @@ CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEG DECLARE parent_place_id BIGINT; BEGIN - IF street is not null THEN - -- Check for addr:street attributes - -- Note that addr:street links can only be indexed, once the street itself is indexed - parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, street); - IF parent_place_id is not null THEN - {% if debug %}RAISE WARNING 'Get parent form addr:street: %', parent_place_id;{% endif %} - RETURN parent_place_id; - END IF; + -- Check for addr:street attributes + parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, token_info); + IF parent_place_id is not null THEN + {% if debug %}RAISE WARNING 'Get parent from addr:street: %', parent_place_id;{% endif %} + RETURN parent_place_id; END IF; -- Check for addr:place attributes. - IF place is not null THEN - parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, place); - IF parent_place_id is not null THEN - {% if debug %}RAISE WARNING 'Get parent form addr:place: %', parent_place_id;{% endif %} - RETURN parent_place_id; - END IF; - END IF; - - RETURN NULL; + parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, token_info); + {% if debug %}RAISE WARNING 'Get parent from addr:place: %', parent_place_id;{% endif %} + RETURN parent_place_id; END; $$ LANGUAGE plpgsql STABLE; + CREATE OR REPLACE FUNCTION delete_location(OLD_place_id BIGINT) RETURNS BOOLEAN AS $$ diff --git a/lib-sql/tiger_import_start.sql b/lib-sql/tiger_import_start.sql index faa4efbb..f344e174 100644 --- a/lib-sql/tiger_import_start.sql +++ b/lib-sql/tiger_import_start.sql @@ -14,7 +14,6 @@ DECLARE out_partition INTEGER; out_parent_place_id BIGINT; location RECORD; - address_street_word_ids INTEGER[]; BEGIN @@ -54,13 +53,9 @@ BEGIN place_centroid := ST_Centroid(linegeo); out_partition := get_partition('us'); - out_parent_place_id := null; - address_street_word_ids := token_addr_street_match_tokens(token_info); - IF address_street_word_ids IS NOT NULL THEN - out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid, - address_street_word_ids); - END IF; + out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid, + token_info); IF out_parent_place_id IS NULL THEN SELECT getNearestParallelRoadFeature(out_partition, linegeo) diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql index ffe6648c..6092319a 100644 --- a/lib-sql/tokenizer/icu_tokenizer.sql +++ b/lib-sql/tokenizer/icu_tokenizer.sql @@ -34,40 +34,59 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB) - RETURNS INTEGER[] +CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB) + RETURNS BOOLEAN +AS $$ + SELECT info->>'street' is not null; +$$ LANGUAGE SQL IMMUTABLE; + + +CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB) + RETURNS BOOLEAN AS $$ - SELECT (info->>'street')::INTEGER[] + SELECT info->>'place' is not null; +$$ LANGUAGE SQL IMMUTABLE; + + +CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[]) + RETURNS BOOLEAN +AS $$ + SELECT (info->>'street')::INTEGER[] <@ street_tokens $$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB) - RETURNS INTEGER[] +CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[]) + RETURNS BOOLEAN AS $$ - SELECT (info->>'place_match')::INTEGER[] + SELECT (info->>'place')::INTEGER[] <@ place_tokens $$ LANGUAGE SQL IMMUTABLE STRICT; CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB) RETURNS INTEGER[] AS $$ - SELECT (info->>'place_search')::INTEGER[] + SELECT (info->>'place')::INTEGER[] $$ LANGUAGE SQL IMMUTABLE STRICT; -DROP TYPE IF EXISTS token_addresstoken CASCADE; -CREATE TYPE token_addresstoken AS ( - key TEXT, - match_tokens INT[], - search_tokens INT[] -); +CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB) + RETURNS SETOF TEXT +AS $$ + SELECT * FROM jsonb_object_keys(info->'addr'); +$$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB) - RETURNS SETOF token_addresstoken + +CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT) + RETURNS INTEGER[] AS $$ - SELECT key, (value->>1)::int[] as match_tokens, - (value->>0)::int[] as search_tokens - FROM jsonb_each(info->'addr'); + SELECT (info->'addr'->>key)::INTEGER[]; +$$ LANGUAGE SQL IMMUTABLE STRICT; + + +CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[]) + RETURNS BOOLEAN +AS $$ + SELECT (info->'addr'->>key)::INTEGER[] <@ tokens; $$ LANGUAGE SQL IMMUTABLE STRICT; @@ -127,15 +146,34 @@ BEGIN VALUES (term_id, term, 'w', json_build_object('count', term_count)); END IF; - IF term_count < {{ max_word_freq }} THEN - partial_tokens := array_merge(partial_tokens, ARRAY[term_id]); - END IF; + partial_tokens := array_merge(partial_tokens, ARRAY[term_id]); END LOOP; END; $$ LANGUAGE plpgsql; +CREATE OR REPLACE FUNCTION getorcreate_partial_word(partial TEXT) + RETURNS INTEGER + AS $$ +DECLARE + token INTEGER; +BEGIN + SELECT min(word_id) INTO token + FROM word WHERE word_token = partial and type = 'w'; + + IF token IS NULL THEN + token := nextval('seq_word'); + INSERT INTO word (word_id, word_token, type, info) + VALUES (token, partial, 'w', json_build_object('count', 0)); + END IF; + + RETURN token; +END; +$$ +LANGUAGE plpgsql; + + CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT) RETURNS INTEGER AS $$ diff --git a/lib-sql/tokenizer/legacy_tokenizer.sql b/lib-sql/tokenizer/legacy_tokenizer.sql index a2c6b520..2b734e6f 100644 --- a/lib-sql/tokenizer/legacy_tokenizer.sql +++ b/lib-sql/tokenizer/legacy_tokenizer.sql @@ -34,17 +34,31 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB) - RETURNS INTEGER[] +CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB) + RETURNS BOOLEAN +AS $$ + SELECT info->>'street' is not null; +$$ LANGUAGE SQL IMMUTABLE; + + +CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB) + RETURNS BOOLEAN AS $$ - SELECT (info->>'street')::INTEGER[] + SELECT info->>'place_match' is not null; +$$ LANGUAGE SQL IMMUTABLE; + + +CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[]) + RETURNS BOOLEAN +AS $$ + SELECT (info->>'street')::INTEGER[] && street_tokens $$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB) - RETURNS INTEGER[] +CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[]) + RETURNS BOOLEAN AS $$ - SELECT (info->>'place_match')::INTEGER[] + SELECT (info->>'place_match')::INTEGER[] && place_tokens $$ LANGUAGE SQL IMMUTABLE STRICT; @@ -55,19 +69,24 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; -DROP TYPE IF EXISTS token_addresstoken CASCADE; -CREATE TYPE token_addresstoken AS ( - key TEXT, - match_tokens INT[], - search_tokens INT[] -); +CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB) + RETURNS SETOF TEXT +AS $$ + SELECT * FROM jsonb_object_keys(info->'addr'); +$$ LANGUAGE SQL IMMUTABLE STRICT; + + +CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT) + RETURNS INTEGER[] +AS $$ + SELECT (info->'addr'->key->>0)::INTEGER[]; +$$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB) - RETURNS SETOF token_addresstoken + +CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[]) + RETURNS BOOLEAN AS $$ - SELECT key, (value->>1)::int[] as match_tokens, - (value->>0)::int[] as search_tokens - FROM jsonb_each(info->'addr'); + SELECT (info->'addr'->key->>1)::INTEGER[] && tokens; $$ LANGUAGE SQL IMMUTABLE STRICT; diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 61263678..5768fd35 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -17,7 +17,6 @@ from nominatim.tokenizer.icu_rule_loader import ICURuleLoader from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer -DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq" DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization" LOG = logging.getLogger() @@ -39,7 +38,6 @@ class LegacyICUTokenizer(AbstractTokenizer): self.data_dir = data_dir self.naming_rules = None self.term_normalization = None - self.max_word_frequency = None def init_new_db(self, config, init_db=True): @@ -52,10 +50,9 @@ class LegacyICUTokenizer(AbstractTokenizer): config='TOKENIZER_CONFIG')) self.naming_rules = ICUNameProcessorRules(loader=loader) self.term_normalization = config.TERM_NORMALIZATION - self.max_word_frequency = config.MAX_WORD_FREQUENCY self._install_php(config.lib_dir.php) - self._save_config(config) + self._save_config() if init_db: self.update_sql_functions(config) @@ -68,7 +65,6 @@ class LegacyICUTokenizer(AbstractTokenizer): with connect(self.dsn) as conn: self.naming_rules = ICUNameProcessorRules(conn=conn) self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION) - self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ) def finalize_import(self, _): @@ -81,10 +77,8 @@ class LegacyICUTokenizer(AbstractTokenizer): """ Reimport the SQL functions for this tokenizer. """ with connect(self.dsn) as conn: - max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ) sqlp = SQLPreprocessor(conn, config) - sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql', - max_word_freq=max_word_freq) + sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql') def check_database(self): @@ -122,20 +116,19 @@ class LegacyICUTokenizer(AbstractTokenizer): php_file = self.data_dir / "tokenizer.php" php_file.write_text(dedent(f"""\ ' '")) as anl: self.analyzer = anl yield anl @@ -424,7 +427,7 @@ class TestPlaceAddress: def test_process_place_street(self): info = self.process_address(street='Grand Road') - assert eval(info['street']) == self.name_token_set('#GRAND ROAD') + assert eval(info['street']) == self.name_token_set('GRAND', 'ROAD') def test_process_place_street_empty(self): @@ -436,16 +439,13 @@ class TestPlaceAddress: def test_process_place_place(self): info = self.process_address(place='Honu Lulu') - assert eval(info['place_search']) == self.name_token_set('#HONU LULU', - 'HONU', 'LULU') - assert eval(info['place_match']) == self.name_token_set('#HONU LULU') + assert eval(info['place']) == self.name_token_set('HONU', 'LULU') def test_process_place_place_empty(self): info = self.process_address(place='🜵') - assert 'place_search' not in info - assert 'place_match' not in info + assert 'place' not in info def test_process_place_address_terms(self): @@ -453,16 +453,12 @@ class TestPlaceAddress: suburb='Zwickau', street='Hauptstr', full='right behind the church') - city_full = self.name_token_set('#ZWICKAU') - city_all = self.name_token_set('#ZWICKAU', 'ZWICKAU') - state_full = self.name_token_set('#SACHSEN') - state_all = self.name_token_set('#SACHSEN', 'SACHSEN') + city = self.name_token_set('ZWICKAU') + state = self.name_token_set('SACHSEN') - result = {k: [eval(v[0]), eval(v[1])] for k,v in info['addr'].items()} + result = {k: eval(v) for k,v in info['addr'].items()} - assert result == {'city': [city_all, city_full], - 'suburb': [city_all, city_full], - 'state': [state_all, state_full]} + assert result == {'city': city, 'suburb': city, 'state': state} def test_process_place_address_terms_empty(self):