set(BUILD_API on CACHE BOOL "Build everything for the API server")
set(BUILD_MODULE off CACHE BOOL "Build PostgreSQL module for legacy tokenizer")
set(BUILD_TESTS on CACHE BOOL "Build test suite")
- set(BUILD_DOCS on CACHE BOOL "Build documentation")
- set(BUILD_MANPAGE on CACHE BOOL "Build Manual Page")
set(BUILD_OSM2PGSQL on CACHE BOOL "Build osm2pgsql (expert only)")
set(INSTALL_MUNIN_PLUGINS on CACHE BOOL "Install Munin plugins for supervising Nominatim")
- # python (imports/updates only)
+ # python (importer and API frontend)
#-----------------------------------------------------------------------------
- if (BUILD_IMPORTER)
+ if (BUILD_IMPORTER OR BUILD_API)
find_package(PythonInterp 3.7 REQUIRED)
endif()
${PROJECT_BINARY_DIR}/nominatim)
endif()
+#-----------------------------------------------------------------------------
+# Targets for running a development webserver from the build directory.
+#-----------------------------------------------------------------------------
+
+if (BUILD_API)
+ set(WEBSITEFILES
+ 403.html
+ 509.html
+ crossdomain.xml
+ favicon.ico
+ nominatim.xml
+ robots.txt
+ taginfo.json
+ )
+
+ foreach (webfile ${WEBSITEFILES})
+ configure_file(${PROJECT_SOURCE_DIR}/website/${webfile}
+ ${PROJECT_BINARY_DIR}/website/${webfile})
+ endforeach()
+endif()
+
#-----------------------------------------------------------------------------
# Tests
#-----------------------------------------------------------------------------
add_subdirectory(module)
endif()
- #-----------------------------------------------------------------------------
- # Documentation
- #-----------------------------------------------------------------------------
-
- if (BUILD_DOCS)
- add_subdirectory(docs)
- endif()
-
- #-----------------------------------------------------------------------------
- # Manual page
- #-----------------------------------------------------------------------------
-
- if (BUILD_MANPAGE)
- add_subdirectory(man)
- endif()
-
#-----------------------------------------------------------------------------
# Installation
#-----------------------------------------------------------------------------
DESTINATION ${CMAKE_INSTALL_BINDIR}
RENAME nominatim)
- install(DIRECTORY nominatim
- DESTINATION ${NOMINATIM_LIBDIR}/lib-python
- FILES_MATCHING PATTERN "*.py"
- PATTERN "paths.py" EXCLUDE
- PATTERN __pycache__ EXCLUDE)
-
if (EXISTS ${PHP_BIN})
configure_file(${PROJECT_SOURCE_DIR}/cmake/paths-py.tmpl paths-py.installed)
else()
configure_file(${PROJECT_SOURCE_DIR}/cmake/paths-py-no-php.tmpl paths-py.installed)
endif()
- install(FILES ${PROJECT_BINARY_DIR}/paths-py.installed
- DESTINATION ${NOMINATIM_LIBDIR}/lib-python/nominatim
- RENAME paths.py)
+
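+ # Install the nominatim_db and nominatim_api packages and place the generated paths.py in each.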
+ foreach (submodule nominatim_db nominatim_api)
+ install(DIRECTORY src/${submodule}
+ DESTINATION ${NOMINATIM_LIBDIR}/lib-python
+ FILES_MATCHING PATTERN "*.py"
+ PATTERN "paths.py" EXCLUDE
+ PATTERN __pycache__ EXCLUDE)
+ install(FILES ${PROJECT_BINARY_DIR}/paths-py.installed
+ DESTINATION ${NOMINATIM_LIBDIR}/lib-python/${submodule}
+ RENAME paths.py)
+ endforeach()
install(DIRECTORY lib-sql DESTINATION ${NOMINATIM_LIBDIR})
- # SPDX-License-Identifier: GPL-2.0-only
+ # SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
- # Copyright (C) 2022 by the Nominatim developer community.
+ # Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
Dict, Set, Iterable
import itertools
- import json
import logging
from pathlib import Path
from textwrap import dedent
- from nominatim.db.connection import connect, Connection, Cursor
- from nominatim.config import Configuration
- from nominatim.db.utils import CopyBuffer
- from nominatim.db.sql_preprocessor import SQLPreprocessor
- from nominatim.data.place_info import PlaceInfo
- from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
- from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
- from nominatim.data.place_name import PlaceName
- from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
- from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
+ from psycopg.types.json import Jsonb
+ from psycopg import sql as pysql
+
+ from ..db.connection import connect, Connection, Cursor, server_version_tuple,\
+ drop_tables, table_exists, execute_scalar
+ from ..config import Configuration
+ from ..db.sql_preprocessor import SQLPreprocessor
+ from ..data.place_info import PlaceInfo
+ from ..data.place_name import PlaceName
+ from .icu_rule_loader import ICURuleLoader
+ from .place_sanitizer import PlaceSanitizer
+ from .icu_token_analysis import ICUTokenAnalysis
+ from .base import AbstractAnalyzer, AbstractTokenizer
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
""" Recompute frequencies for all name words.
"""
with connect(self.dsn) as conn:
- if not conn.table_exists('search_name'):
+ if not table_exists(conn, 'search_name'):
return
with conn.cursor() as cur:
cur.execute('ANALYSE search_name')
if threads > 1:
- cur.execute('SET max_parallel_workers_per_gather TO %s',
- (min(threads, 6),))
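+ # psycopg 3 binds parameters server-side and SET does not accept bound
+ # parameters, so the value is spliced in safely via the sql composition helpers.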
+ cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}')
+ .format(pysql.Literal(min(threads, 6),)))
- if conn.server_version_tuple() < (12, 0):
+ if server_version_tuple(conn) < (12, 0):
LOG.info('Computing word frequencies')
- cur.drop_table('word_frequencies')
- cur.drop_table('addressword_frequencies')
+ drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
cur.execute("""CREATE TEMP TABLE word_frequencies AS
SELECT unnest(name_vector) as id, count(*)
FROM search_name GROUP BY id""")
$$ LANGUAGE plpgsql IMMUTABLE;
""")
LOG.info('Update word table with recomputed frequencies')
- cur.drop_table('tmp_word')
+ drop_tables(conn, 'tmp_word')
cur.execute("""CREATE TABLE tmp_word AS
SELECT word_id, word_token, type, word,
word_freq_update(word_id, info) as info
FROM word
""")
- cur.drop_table('word_frequencies')
- cur.drop_table('addressword_frequencies')
+ drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
else:
LOG.info('Computing word frequencies')
- cur.drop_table('word_frequencies')
- cur.execute('ANALYSE search_name')
- cur.execute('ANALYSE word')
+ drop_tables(conn, 'word_frequencies')
cur.execute("""
CREATE TEMP TABLE word_frequencies AS
WITH word_freq AS MATERIALIZED (
cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
cur.execute('ANALYSE word_frequencies')
LOG.info('Update word table with recomputed frequencies')
- cur.drop_table('tmp_word')
+ drop_tables(conn, 'tmp_word')
cur.execute("""CREATE TABLE tmp_word AS
SELECT word_id, word_token, type, word,
(CASE WHEN wf.info is null THEN word.info
END) as info
FROM word LEFT JOIN word_frequencies wf
ON word.word_id = wf.id
+ ORDER BY word_id
""")
- cur.drop_table('word_frequencies')
+ drop_tables(conn, 'word_frequencies')
with conn.cursor() as cur:
cur.execute('SET max_parallel_workers_per_gather TO 0')
""" Remove unused house numbers.
"""
with connect(self.dsn) as conn:
- if not conn.table_exists('search_name'):
+ if not table_exists(conn, 'search_name'):
return
with conn.cursor(name="hnr_counter") as cur:
cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
frequencies.
"""
with connect(self.dsn) as conn:
- with conn.cursor() as cur:
- cur.drop_table('word')
+ drop_tables(conn, 'word')
sqlp = SQLPreprocessor(conn, config)
sqlp.run_string(conn, """
CREATE TABLE word (
""" Rename all tables and indexes used by the tokenizer.
"""
with connect(self.dsn) as conn:
+ drop_tables(conn, 'word')
with conn.cursor() as cur:
- cur.drop_table('word')
cur.execute(f"ALTER TABLE {old} RENAME TO word")
for idx in ('word_token', 'word_id'):
cur.execute(f"""ALTER INDEX idx_{old}_{idx}
def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
token_analysis: ICUTokenAnalysis) -> None:
- self.conn: Optional[Connection] = connect(dsn).connection
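+ # connect() now returns the psycopg Connection directly, so the .connection indirection is gone.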
+ self.conn: Optional[Connection] = connect(dsn)
self.conn.autocommit = True
self.sanitizer = sanitizer
self.token_analysis = token_analysis
if terms:
with self.conn.cursor() as cur:
- cur.execute_values("""SELECT create_postcode_word(pc, var)
- FROM (VALUES %s) AS v(pc, var)""",
- terms)
+ cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)
to_add = new_phrases - existing_phrases
added = 0
- with CopyBuffer() as copystr:
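+ # Stream the new phrase rows into the word table via COPY, wrapping the info payload as Jsonb.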
+ with cursor.copy('COPY word(word_token, type, word, info) FROM STDIN') as copy:
for word, cls, typ, oper in to_add:
term = self._search_normalized(word)
if term:
- copystr.add(term, 'S', word,
- json.dumps({'class': cls, 'type': typ,
- 'op': oper if oper in ('in', 'near') else None}))
+ copy.write_row((term, 'S', word,
+ Jsonb({'class': cls, 'type': typ,
+ 'op': oper if oper in ('in', 'near') else None})))
added += 1
- copystr.copy_out(cursor, 'word',
- columns=['word_token', 'type', 'word', 'info'])
-
return added
to_delete = existing_phrases - new_phrases
if to_delete:
- cursor.execute_values(
- """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
- WHERE type = 'S' and word = name
- and info->>'class' = in_class and info->>'type' = in_type
- and ((op = '-' and info->>'op' is null) or op = info->>'op')
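+ # executemany() runs the DELETE once per phrase tuple; '-' stands in for phrases without an operator.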
+ cursor.executemany(
+ """ DELETE FROM word
+ WHERE type = 'S' and word = %s
+ and info->>'class' = %s and info->>'type' = %s
+ and %s = coalesce(info->>'op', '-')
""", to_delete)
return len(to_delete)
gone_tokens.update(existing_tokens[False] & word_tokens)
if gone_tokens:
cur.execute("""DELETE FROM word
- USING unnest(%s) as token
+ USING unnest(%s::text[]) as token
WHERE type = 'C' and word = %s
and word_token = token""",
(list(gone_tokens), country_code))
if internal:
sql = """INSERT INTO word (word_token, type, word, info)
(SELECT token, 'C', %s, '{"internal": "yes"}'
- FROM unnest(%s) as token)
+ FROM unnest(%s::text[]) as token)
"""
else:
sql = """INSERT INTO word (word_token, type, word)
(SELECT token, 'C', %s
- FROM unnest(%s) as token)
+ FROM unnest(%s::text[]) as token)
"""
cur.execute(sql, (country_code, list(new_tokens)))
if norm_name:
result = self._cache.housenumbers.get(norm_name, result)
if result[0] is None:
- with self.conn.cursor() as cur:
- hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
+ hid = execute_scalar(self.conn, "SELECT getorcreate_hnr_id(%s)", (norm_name, ))
- result = hid, norm_name
- self._cache.housenumbers[norm_name] = result
+ result = hid, norm_name
+ self._cache.housenumbers[norm_name] = result
else:
# Otherwise use the analyzer to determine the canonical name.
# Per convention we use the first variant as the 'lookup name', the
if result[0] is None:
variants = analyzer.compute_variants(word_id)
if variants:
- with self.conn.cursor() as cur:
- hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
+ hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
(word_id, list(variants)))
- result = hid, variants[0]
- self._cache.housenumbers[word_id] = result
+ result = hid, variants[0]
+ self._cache.housenumbers[word_id] = result
return result