From 8c074821f69099a7fb3b740aadb386cd3618d212 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Sun, 28 Jan 2024 20:12:22 +0100 Subject: [PATCH 1/3] Chore: Naming things. Minor fixes. --- .github/workflows/main.yml | 11 +++++++++-- .github/workflows/release.yml | 2 +- .gitignore | 3 +++ README.rst | 2 +- docs/_static/.gitkeep | 0 5 files changed, 14 insertions(+), 4 deletions(-) create mode 100644 docs/_static/.gitkeep diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 981d3ba..c208e41 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,10 +1,17 @@ -name: Run CI +name: Tests on: + pull_request: ~ push: - pull_request: + branches: [ main ] + + # Allow job to be triggered manually. workflow_dispatch: + # Run job each night. + schedule: + - cron: '0 3 * * *' + # Cancel in-progress jobs when pushing to the same branch. concurrency: cancel-in-progress: true diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e396d30..908a8be 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,4 +1,4 @@ -name: Release new version +name: Release on: push: diff --git a/.gitignore b/.gitignore index 715d693..b23c9cc 100644 --- a/.gitignore +++ b/.gitignore @@ -113,6 +113,9 @@ ENV/ env.bak/ venv.bak/ +# PyCharm project settings +.idea + # Spyder project settings .spyderproject .spyproject diff --git a/README.rst b/README.rst index ccc2374..c538f30 100644 --- a/README.rst +++ b/README.rst @@ -74,7 +74,7 @@ Usage .. code-block:: console - python -m http.server -d build + python -m http.server -d _build/html Please access http://localhost:8000/search.html diff --git a/docs/_static/.gitkeep b/docs/_static/.gitkeep new file mode 100644 index 0000000..e69de29 From 9727ce156ef1ad3a8789b1ec0ea07a00f62b9ce1 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Sun, 28 Jan 2024 20:12:04 +0100 Subject: [PATCH 2/3] Dependency: Downgrade to Sphinx 6.x Otherwise, `language_data` renders like `var stopwords = ;`, which is invalid code. --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c90bdf8..c0631a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ dynamic = ["version", "description"] dependencies = [ "docutils", "peewee", - "Sphinx", + "sphinx<7", ] [project.optional-dependencies] From 5626d7e517770a3d348f5a56b70f9d871a66508e Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Sun, 28 Jan 2024 16:14:57 +0100 Subject: [PATCH 3/3] Add backend support for PostgreSQL --- .github/workflows/main.yml | 14 +++++ README.rst | 22 +++++++ docs/conf.py | 1 + docs/getting-started.rst | 7 ++- pyproject.toml | 1 + src/atsphinx/sqlite3fts/__init__.py | 7 ++- src/atsphinx/sqlite3fts/builders.py | 15 +++-- src/atsphinx/sqlite3fts/events.py | 17 +++++- src/atsphinx/sqlite3fts/models.py | 85 +++++++++++++++++++++------- src/atsphinx/sqlite3fts/playhouse.py | 18 ++++++ tests/__init__.py | 1 + tests/conftest.py | 29 ++++++++++ tests/roots/test-default/conf.py | 2 + tests/test_builders.py | 10 +--- tests/test_events.py | 8 +-- tests/test_model.py | 39 +++++++++++++ tests/test_services.py | 6 +- tests/util.py | 54 ++++++++++++++++++ 18 files changed, 283 insertions(+), 53 deletions(-) create mode 100644 src/atsphinx/sqlite3fts/playhouse.py create mode 100644 tests/__init__.py create mode 100644 tests/test_model.py create mode 100644 tests/util.py diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c208e41..815bee4 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -37,6 +37,13 @@ jobs: max-parallel: 4 matrix: python-version: [3.7, 3.8, 3.9, '3.10', '3.11'] + services: + postgresql: + image: postgres:16 + ports: + - 5432:5432 + env: + POSTGRES_HOST_AUTH_METHOD: trust steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} @@ -66,6 +73,13 @@ jobs: ls -l dist documentation: runs-on: ubuntu-latest + services: + postgresql: + image: postgres:16 + ports: + - 5432:5432 + env: + POSTGRES_HOST_AUTH_METHOD: trust steps: - uses: actions/checkout@v3 - name: Setup python diff --git a/README.rst b/README.rst index c538f30..20d9276 100644 --- a/README.rst +++ b/README.rst @@ -79,6 +79,28 @@ Usage Please access http://localhost:8000/search.html +Development +=========== + +Install package in development mode:: + + pip install --editable='.[cli,docs,test]' --prefer-binary + +Start PostgreSQL server:: + + docker run --rm -it --publish=5432:5432 --env "POSTGRES_HOST_AUTH_METHOD=trust" postgres:16 postgres -c log_statement=all + +Invoke software tests:: + + export POSTGRES_LOG_STATEMENT=all + pytest -vvv + +Invoke linters:: + + pip install pre-commit + pre-commit run --all-files + + .. _atsphinx-sqlite3fts: https://pypi.org/project/atsphinx-sqlite3fts/ .. _Kazuya Takei: https://github.com/attakei .. _readthedocs-sphinx-search: https://github.com/readthedocs/readthedocs-sphinx-search diff --git a/docs/conf.py b/docs/conf.py index 22a4647..717b66f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -32,6 +32,7 @@ } # atsphinx-sqlite3fts sqlite3fts_use_search_html = True +sqlite3fts_database_url = "postgresql://postgres@localhost:5432" def setup(app): # noqa: D103 diff --git a/docs/getting-started.rst b/docs/getting-started.rst index 850475a..975971b 100644 --- a/docs/getting-started.rst +++ b/docs/getting-started.rst @@ -47,8 +47,11 @@ You can build database by ``sqlite`` builder. .. code-block:: console - make sqlite - sqlite3 _build/sqlite/db.sqlite + make fts-index + +.. code-block:: console + + psql postgresql://postgres@localhost:5432/ --command 'SELECT * FROM document;' .. code-block:: sqlite3 diff --git a/pyproject.toml b/pyproject.toml index c0631a2..0f8484a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ dynamic = ["version", "description"] dependencies = [ "docutils", "peewee", + "psycopg2[binary]", "sphinx<7", ] diff --git a/src/atsphinx/sqlite3fts/__init__.py b/src/atsphinx/sqlite3fts/__init__.py index 7c87e0e..2b98968 100644 --- a/src/atsphinx/sqlite3fts/__init__.py +++ b/src/atsphinx/sqlite3fts/__init__.py @@ -1,4 +1,4 @@ -"""Sphinx document searcher using SQLite3.""" +"""Sphinx document searcher using SQL database.""" from sphinx.application import Sphinx from . import builders, events @@ -10,9 +10,10 @@ def setup(app: Sphinx): """Entrypoint as Sphinx extension.""" app.add_config_value("sqlite3fts_exclude_pages", [], "env") app.add_config_value("sqlite3fts_use_search_html", False, "env") - app.add_builder(builders.SqliteBuilder) + app.add_config_value("sqlite3fts_database_url", None, "env") + app.add_builder(builders.FtsIndexer) app.connect("config-inited", events.setup_search_html) - app.connect("builder-inited", events.configure_database) + app.connect("config-inited", events.configure_database) app.connect("html-page-context", events.register_document) app.connect("build-finished", events.save_database) return { diff --git a/src/atsphinx/sqlite3fts/builders.py b/src/atsphinx/sqlite3fts/builders.py index 4d7468c..bbc5cf6 100644 --- a/src/atsphinx/sqlite3fts/builders.py +++ b/src/atsphinx/sqlite3fts/builders.py @@ -7,13 +7,14 @@ from . import models, services -class SqliteBuilder(Builder): - """Single database generation builder. +class FtsIndexer(Builder): + """ + Fulltext index builder for databases. - This is custom builder to generate only SQLite database file + A custom builder to generate fulltext indexes, stored in SQL databases. """ - name = "sqlite" + name = "fts-index" allow_parallel = True def get_target_uri(self, docname: str, typ: str = None) -> str: # noqa: D102 @@ -23,7 +24,11 @@ def get_outdated_docs(self) -> str: # noqa: D102 return "db.sqlite" def prepare_writing(self, docnames: Set[str]) -> None: # noqa: D102 - pass + from atsphinx.sqlite3fts.models import Content, Document, Section + + Document.truncate_table(cascade=True) + Section.truncate_table(cascade=True) + Content.truncate_table(cascade=True) def write_doc(self, docname: str, doctree: nodes.document) -> None: """Register content of document into database. diff --git a/src/atsphinx/sqlite3fts/events.py b/src/atsphinx/sqlite3fts/events.py index c622276..efeecb7 100644 --- a/src/atsphinx/sqlite3fts/events.py +++ b/src/atsphinx/sqlite3fts/events.py @@ -33,12 +33,23 @@ def _generate_search_html(app: Sphinx): app.connect("html-collect-pages", _generate_search_html) -def configure_database(app: Sphinx): - """Connect database for project output.""" +def configure_database(app: Sphinx, config: Config): + """ + Connect database for project output. + + TODO: Add support for multiple database backends? + """ + # SQLite + """ db_path = Path(app.outdir) / "db.sqlite" if db_path.exists(): db_path.unlink() - models.initialize(db_path) + models.initialize("sqlite", db_path) + """ + # PostgreSQL + if not app.config.sqlite3fts_database_url: + raise ValueError("Configuring database failed") + models.initialize("postgresql", app.config.sqlite3fts_database_url) def register_document( diff --git a/src/atsphinx/sqlite3fts/models.py b/src/atsphinx/sqlite3fts/models.py index 7bb5e51..fe5582a 100644 --- a/src/atsphinx/sqlite3fts/models.py +++ b/src/atsphinx/sqlite3fts/models.py @@ -6,47 +6,52 @@ TODO: Add support for multiple database backends? """ +import os from pathlib import Path from typing import Iterable -from playhouse import sqlite_ext +from peewee import SQL, fn +from playhouse import postgres_ext as ext -db_proxy = sqlite_ext.DatabaseProxy() +from atsphinx.sqlite3fts.playhouse import TSVectorFieldPlus +db_proxy = ext.DatabaseProxy() -class Document(sqlite_ext.Model): + +class Document(ext.Model): """Document main model.""" - page = sqlite_ext.TextField(null=False, unique=True) - title = sqlite_ext.TextField(null=False) + page = ext.TextField(null=False, unique=True) + title = ext.TextField(null=False) class Meta: # noqa: D106 database = db_proxy -class Section(sqlite_ext.Model): +class Section(ext.Model): """Section unit of document.""" - document = sqlite_ext.ForeignKeyField(Document) - root = sqlite_ext.BooleanField(default=False, null=False) - ref = sqlite_ext.TextField(null=False) - title = sqlite_ext.TextField(null=False) - body = sqlite_ext.TextField(null=False) + document = ext.ForeignKeyField(Document) + root = ext.BooleanField(default=False, null=False) + ref = ext.TextField(null=False) + title = ext.TextField(null=False) + body = ext.TextField(null=False) class Meta: # noqa: D106 database = db_proxy -class Content(sqlite_ext.FTS5Model): +class Content(ext.Model): """Searching model.""" - rowid = sqlite_ext.RowIDField() - title = sqlite_ext.SearchField() - body = sqlite_ext.SearchField() + rowid = ext.IntegerField() + title = TSVectorFieldPlus() + body = TSVectorFieldPlus() class Meta: # noqa: D106 database = db_proxy - options = {"tokenize": "trigram"} + # TODO: This is an option from SQLite, it does not work on other DBMS. + # options = {"tokenize": "trigram"} def store_document(document: Document, sections: Iterable[Section]): @@ -58,32 +63,68 @@ def store_document(document: Document, sections: Iterable[Section]): Content.insert( { Content.rowid: section.id, - Content.title: section.title or document.title, - Content.body: section.body, + Content.title: fn.to_tsvector(section.title or document.title), + Content.body: fn.to_tsvector(section.body), } ).execute() def search_documents(keyword: str) -> Iterable[Section]: """Search documents from keyword by full-text-search.""" + # SQLite. + """ return ( Section.select() .join(Content, on=(Section.id == Content.rowid)) .where(Content.match(keyword)) .order_by(Content.bm25()) ) + """ + + # PostgreSQL. + # https://www.postgresql.org/docs/current/textsearch-controls.html + # https://stackoverflow.com/questions/25033184/postgresql-full-text-search-performance-not-acceptable-when-ordering-by-ts-rank/25245291#25245291 + return ( + Section.select( + Section, + fn.ts_rank_cd(Content.title, fn.websearch_to_tsquery(keyword), 32).alias( + "rank_title" + ), + fn.ts_rank_cd(Content.body, fn.websearch_to_tsquery(keyword), 32).alias( + "rank_body" + ), + ) + .join(Content, on=(Section.id == Content.rowid)) + .where( + Content.title.match(keyword, web=True) + | Content.body.match(keyword, web=True) + ) + .order_by( + SQL("rank_title").desc(), + SQL("rank_body").desc(), + ) + ) -def bind(db_path: Path): +def bind(db_type: str, db_path: Path): """Bind connection. This works only set db into proxy, not included creating tables. """ - db = sqlite_ext.SqliteExtDatabase(db_path) + if db_type == "sqlite": + db = ext.SqliteExtDatabase(db_path) + elif db_type == "postgresql": + db = ext.PostgresqlExtDatabase(db_path) + if "POSTGRES_LOG_STATEMENT" in os.environ: + db.execute_sql( + f"SET log_statement='{os.environ['POSTGRES_LOG_STATEMENT']}';" + ) + else: + raise ValueError(f"Unknown database type: {db_type}") db_proxy.initialize(db) -def initialize(db_path: Path): +def initialize(db_type: str, db_path: Path): """Bind connection and create tables.""" - bind(db_path) + bind(db_type, db_path) db_proxy.create_tables([Document, Section, Content]) diff --git a/src/atsphinx/sqlite3fts/playhouse.py b/src/atsphinx/sqlite3fts/playhouse.py new file mode 100644 index 0000000..9724242 --- /dev/null +++ b/src/atsphinx/sqlite3fts/playhouse.py @@ -0,0 +1,18 @@ +"""Peewee/Playhouse extension.""" +from peewee import Expression, Field, TextField, fn +from playhouse.postgres_ext import TS_MATCH, IndexedFieldMixin + + +class TSVectorFieldPlus(IndexedFieldMixin, TextField): + """An advanced `TSVectorField`, capable to use `websearch_to_tsquery`.""" + + field_type = "TSVECTOR" + __hash__ = Field.__hash__ + + def match(self, query, language=None, plain=False, web=False): + """Run match.""" + params = (language, query) if language is not None else (query,) + func = fn.plainto_tsquery if plain else fn.to_tsquery + if web: + func = fn.websearch_to_tsquery + return Expression(self, TS_MATCH, func(*params)) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..38bb211 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Test package.""" diff --git a/tests/conftest.py b/tests/conftest.py index 744cb8d..4d20fef 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,11 +1,40 @@ """Configuration for pytest.""" +import os + import pytest from sphinx.testing.path import path +from tests.util import Database + pytest_plugins = "sphinx.testing.fixtures" collect_ignore = ["roots"] +@pytest.fixture(scope="session") +def database_dsn(): + """Pytest fixture providing the database connection string for software tests.""" + return "postgresql://postgres@localhost:5432" + + +@pytest.fixture(scope="session") +def database(database_dsn): + """Pytest fixture returning a database wrapper object.""" + return Database(database_dsn) + + +@pytest.fixture +def conn(database): + """ + Pytest fixture returning a database wrapper object, with content cleared. + + This is intended to provide each test case with a blank slate. + """ + if "POSTGRES_LOG_STATEMENT" in os.environ: + database.execute(f"SET log_statement='{os.environ['POSTGRES_LOG_STATEMENT']}';") + database.reset() + return database + + @pytest.fixture(scope="session") def rootdir(): """Set root directory to use testing sphinx project.""" diff --git a/tests/roots/test-default/conf.py b/tests/roots/test-default/conf.py index 448f16d..63b7ac1 100644 --- a/tests/roots/test-default/conf.py +++ b/tests/roots/test-default/conf.py @@ -3,6 +3,8 @@ "atsphinx.sqlite3fts", ] +sqlite3fts_database_url = "postgresql://postgres@localhost:5432" + # To skip toctree rst_prolog = """ :orphan: diff --git a/tests/test_builders.py b/tests/test_builders.py index 65844fe..45009f9 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -1,17 +1,11 @@ """Test cases for custom builders.""" -import sqlite3 -from pathlib import Path - import pytest from sphinx.testing.util import SphinxTestApp -@pytest.mark.sphinx("sqlite", testroot="default") -def test___work_builder(app: SphinxTestApp, status, warning): # noqa +@pytest.mark.sphinx("fts-index", testroot="default") +def test___work_builder(app: SphinxTestApp, status, warning, conn): # noqa app.build() - db_path = Path(app.outdir) / "db.sqlite" - assert db_path.exists() - conn = sqlite3.connect(db_path) assert len(conn.execute("SELECT * FROM document").fetchall()) > 0 assert len(conn.execute("SELECT * FROM section").fetchall()) > 0 assert len(conn.execute("SELECT * FROM content").fetchall()) > 0 diff --git a/tests/test_events.py b/tests/test_events.py index 7087b41..e517c0a 100644 --- a/tests/test_events.py +++ b/tests/test_events.py @@ -1,15 +1,9 @@ """Test cases for result of events.""" -import sqlite3 -from pathlib import Path - import pytest from sphinx.testing.util import SphinxTestApp @pytest.mark.sphinx("html", testroot="default") -def test___work_builder(app: SphinxTestApp, status, warning): # noqa +def test___work_builder(app: SphinxTestApp, status, warning, conn): # noqa app.build() - db_path = Path(app.outdir) / "db.sqlite" - assert db_path.exists() - conn = sqlite3.connect(db_path) assert len(conn.execute("SELECT * FROM document").fetchall()) > 0 diff --git a/tests/test_model.py b/tests/test_model.py new file mode 100644 index 0000000..1179356 --- /dev/null +++ b/tests/test_model.py @@ -0,0 +1,39 @@ +"""Test cases for model.""" +import pytest +from sphinx.testing.util import SphinxTestApp + +from atsphinx.sqlite3fts import models + + +@pytest.mark.sphinx("fts-index", testroot="default") +def test_search_one_hit(app: SphinxTestApp, status, warning, conn): + """Verify searching with one hit.""" + app.build() + sections = list(models.search_documents("Sub 1-2")) + assert len(sections) == 1 + + assert sections[0].document.page == "index" + assert sections[0].document.title == "Test document" + assert sections[0].ref == "section-1" + assert sections[0].title == "Section 1" + assert sections[0].body == "Sub 1-1\n\nFirst content\nSub 1-2\n\nSecond content" + + +@pytest.mark.sphinx("fts-index", testroot="default") +def test_search_two_hits(app: SphinxTestApp, status, warning, conn): + """Verify searching with two hits.""" + app.build() + sections = list(models.search_documents("section")) + assert len(sections) == 2 + + assert sections[0].document.page == "index" + assert sections[0].document.title == "Test document" + assert sections[0].ref == "section-1" + assert sections[0].title == "Section 1" + assert sections[0].body == "Sub 1-1\n\nFirst content\nSub 1-2\n\nSecond content" + + assert sections[1].document.page == "index" + assert sections[1].document.title == "Test document" + assert sections[1].ref == "section-2" + assert sections[1].title == "Section 2" + assert sections[1].body == "Sub 2-1\n\nEnd" diff --git a/tests/test_services.py b/tests/test_services.py index 83aa374..bbda336 100644 --- a/tests/test_services.py +++ b/tests/test_services.py @@ -10,7 +10,7 @@ RST_DIR = Path(__file__).parent / "rst" -@pytest.mark.sphinx("sqlite", testroot="default") +@pytest.mark.sphinx("fts-index", testroot="default") def test_single_section(app: SphinxTestApp): # noqa rst_path = RST_DIR / "single-section.rst" doctree = parse(app, rst_path.read_text()) @@ -24,7 +24,7 @@ def test_single_section(app: SphinxTestApp): # noqa assert sections[0].root -@pytest.mark.sphinx("sqlite", testroot="default") +@pytest.mark.sphinx("fts-index", testroot="default") def test_sub_sections(app: SphinxTestApp): # noqa rst_path = RST_DIR / "sub-sections.rst" doctree = parse(app, rst_path.read_text()) @@ -35,7 +35,7 @@ def test_sub_sections(app: SphinxTestApp): # noqa assert not sections[1].root -@pytest.mark.sphinx("sqlite", testroot="default") +@pytest.mark.sphinx("fts-index", testroot="default") def test_multiple_sub_sections(app: SphinxTestApp): # noqa rst_path = RST_DIR / "multiple-sub-sections.rst" doctree = parse(app, rst_path.read_text()) diff --git a/tests/util.py b/tests/util.py new file mode 100644 index 0000000..e2ac498 --- /dev/null +++ b/tests/util.py @@ -0,0 +1,54 @@ +"""Database wrapper utility.""" +import psycopg2 + + +class Database: + """ + Wrap connection to the database in a test context. + + TODO: Re-add adapter for SQLite. + TODO: Add adapters for other databases. + """ + + def __init__(self, dsn: str): + """Object constructor.""" + self.dsn = dsn + self.conn = None + self.connect() + + def connect(self): + """Connect to database, optionally disconnecting when already connected.""" + if self.conn is not None: + self.conn.close() + self.conn = psycopg2.connect(self.dsn) + self.conn.autocommit = True + + def execute(self, query): + """Invoke an SQL statement.""" + cursor = self.conn.cursor() + cursor.execute(query) + return Result(cursor=cursor) + + def reset(self): + """Clear database content, to provide each test case with a blank slide.""" + cursor = self.conn.cursor() + cursor.execute("DELETE FROM content;") + cursor.execute("DELETE FROM section;") + cursor.execute("DELETE FROM document;") + cursor.close() + + +class Result: + """Wrap SQLAlchemy result object.""" + + def __init__(self, cursor): + """Object constructor.""" + self.cursor = cursor + + def fetchall(self): + """Fetch all records, exhaustively.""" + return self.cursor.fetchall() + + def __del__(self): + """When object is destroyed, also close the cursor.""" + self.cursor.close()