project(nominatim)
set(NOMINATIM_VERSION_MAJOR 4)
- set(NOMINATIM_VERSION_MINOR 4)
+ set(NOMINATIM_VERSION_MINOR 5)
set(NOMINATIM_VERSION_PATCH 0)
set(NOMINATIM_VERSION "${NOMINATIM_VERSION_MAJOR}.${NOMINATIM_VERSION_MINOR}.${NOMINATIM_VERSION_PATCH}")
set(BUILD_IMPORTER on CACHE BOOL "Build everything for importing/updating the database")
set(BUILD_API on CACHE BOOL "Build everything for the API server")
- set(BUILD_MODULE off CACHE BOOL "Build PostgreSQL module for legacy tokenizer")
set(BUILD_TESTS on CACHE BOOL "Build test suite")
set(BUILD_OSM2PGSQL on CACHE BOOL "Build osm2pgsql (expert only)")
set(INSTALL_MUNIN_PLUGINS on CACHE BOOL "Install Munin plugins for supervising Nominatim")
find_package(PythonInterp 3.7 REQUIRED)
endif()
- #-----------------------------------------------------------------------------
- # PHP
- #-----------------------------------------------------------------------------
-
- # Setting PHP binary variable as to command line (prevailing) or auto detect
-
- if (BUILD_API)
- if (NOT PHP_BIN)
- find_program (PHP_BIN php)
- endif()
- # sanity check if PHP binary exists
- if (NOT EXISTS ${PHP_BIN})
- message(WARNING "PHP binary not found. Only Python frontend can be used.")
- set(PHP_BIN "")
- else()
- message (STATUS "Using PHP binary " ${PHP_BIN})
- endif()
- endif()
-
#-----------------------------------------------------------------------------
# import scripts and utilities (importer only)
#-----------------------------------------------------------------------------
${PROJECT_BINARY_DIR}/nominatim)
endif()
+#-----------------------------------------------------------------------------
+# Targets for running a development webserver from the build directory.
+#-----------------------------------------------------------------------------
+
+if (BUILD_API)
+ set(WEBSITEFILES
+ 403.html
+ 509.html
+ crossdomain.xml
+ favicon.ico
+ nominatim.xml
+ robots.txt
+ taginfo.json
+ )
+
+ foreach (webfile ${WEBSITEFILES})
+ configure_file(${PROJECT_SOURCE_DIR}/website/${webfile}
+ ${PROJECT_BINARY_DIR}/website/${webfile})
+ endforeach()
+endif()
+
#-----------------------------------------------------------------------------
# Tests
#-----------------------------------------------------------------------------
find_program(PYTHON_BEHAVE behave)
find_program(PYLINT NAMES pylint3 pylint)
find_program(PYTEST NAMES pytest py.test-3 py.test)
- find_program(PHPCS phpcs)
- find_program(PHPUNIT phpunit)
if (PYTHON_BEHAVE)
message(STATUS "Using Python behave binary ${PYTHON_BEHAVE}")
message(WARNING "behave not found. BDD tests disabled." )
endif()
- if (PHPUNIT)
- message(STATUS "Using phpunit binary ${PHPUNIT}")
- add_test(NAME php
- COMMAND ${PHPUNIT} ./
- WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/test/php)
- else()
- message(WARNING "phpunit not found. PHP unit tests disabled." )
- endif()
-
- if (PHPCS)
- message(STATUS "Using phpcs binary ${PHPCS}")
- add_test(NAME phpcs
- COMMAND ${PHPCS} --report-width=120 --colors lib-php
- WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
- else()
- message(WARNING "phpcs not found. PHP linting tests disabled." )
- endif()
-
if (PYLINT)
message(STATUS "Using pylint binary ${PYLINT}")
add_test(NAME pylint
endif()
endif()
- #-----------------------------------------------------------------------------
- # Postgres module
- #-----------------------------------------------------------------------------
-
- if (BUILD_MODULE)
- add_subdirectory(module)
- endif()
-
#-----------------------------------------------------------------------------
# Installation
#-----------------------------------------------------------------------------
DESTINATION ${CMAKE_INSTALL_BINDIR}
RENAME nominatim)
- if (EXISTS ${PHP_BIN})
- configure_file(${PROJECT_SOURCE_DIR}/cmake/paths-py.tmpl paths-py.installed)
- else()
- configure_file(${PROJECT_SOURCE_DIR}/cmake/paths-py-no-php.tmpl paths-py.installed)
- endif()
+ configure_file(${PROJECT_SOURCE_DIR}/cmake/paths-py-no-php.tmpl paths-py.installed)
foreach (submodule nominatim_db nominatim_api)
install(DIRECTORY src/${submodule}
endif()
endif()
- if (BUILD_MODULE)
- install(PROGRAMS ${PROJECT_BINARY_DIR}/module/nominatim.so
- DESTINATION ${NOMINATIM_LIBDIR}/module)
- endif()
-
- if (BUILD_API AND EXISTS ${PHP_BIN})
- install(DIRECTORY lib-php DESTINATION ${NOMINATIM_LIBDIR})
- endif()
-
install(FILES settings/env.defaults
settings/address-levels.json
settings/phrase-settings.json
expected_count = sum(t.count for t in hnrs)
partials = {t.token: t.addr_count for trange in address
- for t in self.query.get_partials_list(trange)
- if t.is_indexed}
+ for t in self.query.get_partials_list(trange)}
if not partials:
# can happen when none of the address parts yields any partial tokens
addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
addr_tokens = list({t.token for t in addr_partials})
- partials_indexed = all(t.is_indexed for t in name_partials.values()) \
- and all(t.is_indexed for t in addr_partials)
exp_count = min(t.count for t in name_partials.values()) / (2**(len(name_partials) - 1))
- if (len(name_partials) > 3 or exp_count < 8000) and partials_indexed:
+ if (len(name_partials) > 3 or exp_count < 8000):
yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
return
- addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 30000
+ addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 50000
# Partial term too frequent. Try looking up by rare full names first.
name_fulls = self.query.get_tokens(name, TokenType.WORD)
if name_fulls:
fulls_count = sum(t.count for t in name_fulls)
- if partials_indexed:
- penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
- if fulls_count < 50000 or addr_count < 30000:
+ if fulls_count < 80000 or addr_count < 50000:
yield penalty,fulls_count / (2**len(addr_tokens)), \
self.get_full_name_ranking(name_fulls, addr_partials,
fulls_count > 30000 / max(1, len(addr_tokens)))
# To catch remaining results, lookup by name and address
# We only do this if there is a reasonable number of results expected.
exp_count = exp_count / (2**len(addr_tokens)) if addr_tokens else exp_count
- if exp_count < 10000 and addr_count < 20000\
- and all(t.is_indexed for t in name_partials.values()):
+ if exp_count < 10000 and addr_count < 20000:
penalty += 0.35 * max(1 if name_fulls else 0.1,
5 - len(name_partials) - len(addr_tokens))
yield penalty, exp_count,\
addr_restrict_tokens = []
addr_lookup_tokens = []
for t in addr_partials:
- if t.is_indexed:
- if t.addr_count > 20000:
- addr_restrict_tokens.append(t.token)
- else:
- addr_lookup_tokens.append(t.token)
+ if t.addr_count > 20000:
+ addr_restrict_tokens.append(t.token)
+ else:
+ addr_lookup_tokens.append(t.token)
if addr_restrict_tokens:
lookup.append(dbf.FieldLookup('nameaddress_vector',
# This might yield wrong results, nothing we can do about that.
if use_lookup:
addr_restrict_tokens = []
- addr_lookup_tokens = []
- for t in addr_partials:
- if t.addr_count > 20000:
- addr_restrict_tokens.append(t.token)
- else:
- addr_lookup_tokens.append(t.token)
+ addr_lookup_tokens = [t.token for t in addr_partials if t.is_indexed]
else:
- addr_restrict_tokens = [t.token for t in addr_partials if t.is_indexed]
+ addr_restrict_tokens = [t.token for t in addr_partials]
addr_lookup_tokens = []
return dbf.lookup_by_any_name([t.token for t in name_fulls],
lookup_word = row.word_token
return ICUToken(penalty=penalty, token=row.word_id, count=max(1, count),
- lookup_word=lookup_word, is_indexed=True,
+ lookup_word=lookup_word,
word_token=row.word_token, info=row.info,
addr_count=max(1, addr_count))
standardized form search will work with. All information removed
at this stage is inevitably lost.
"""
- return cast(str, self.normalizer.transliterate(text))
+ norm = cast(str, self.normalizer.transliterate(text))
+ numspaces = norm.count(' ')
+ if numspaces > 4 and len(norm) <= (numspaces + 1) * 3:
+ return ''
+
+ return norm
def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
if len(part.token) <= 4 and part[0].isdigit()\
and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
- ICUToken(0.5, 0, 1, 1, part.token, True, part.token, None))
+ ICUToken(penalty=0.5, token=0,
+ count=1, addr_count=1, lookup_word=part.token,
+ word_token=part.token, info=None))
def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
import itertools
import logging
from pathlib import Path
- from textwrap import dedent
from psycopg.types.json import Jsonb
from psycopg import sql as pysql
"""
self.loader = ICURuleLoader(config)
- self._install_php(config.lib_dir.php, overwrite=True)
self._save_config()
if init_db:
with connect(self.dsn) as conn:
self.loader.load_config_from_db(conn)
- self._install_php(config.lib_dir.php, overwrite=False)
-
def finalize_import(self, config: Configuration) -> None:
""" Do any required postprocessing to make the tokenizer data ready
END) as info
FROM word LEFT JOIN word_frequencies wf
ON word.word_id = wf.id
+ ORDER BY word_id
""")
drop_tables(conn, 'word_frequencies')
return list(s[0].split('@')[0] for s in cur)
- def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
- """ Install the php script for the tokenizer.
- """
- if phpdir is not None:
- assert self.loader is not None
- php_file = self.data_dir / "tokenizer.php"
-
- if not php_file.exists() or overwrite:
- php_file.write_text(dedent(f"""\
- <?php
- @define('CONST_Max_Word_Frequency', 10000000);
- @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
- @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
- require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
-
-
def _save_config(self) -> None:
""" Save the configuration that needs to remain stable for the given
database as database properties.