Snapshot schema 72 (#13873)

Including another batch of fixes to the schema dump script

Snapshot schema 72 (#13873)
0a38c7ec · David Robertson · GitHub · 41461fd4 · 0a38c7ec · 0a38c7ec
Unverified Commit 0a38c7ec authored 2 years ago by David Robertson Committed by GitHub 2 years ago
--- a/changelog.d/13873.misc
+++ b/changelog.d/13873.misc
+Create a new snapshot of the database schema.
--- a/scripts-dev/make_full_schema.sh
+++ b/scripts-dev/make_full_schema.sh
@@ -26,6 +26,9 @@ usage() {
  echo "  Defaults to 9999."
  echo "-h"
  echo "  Display this help text."
+  echo ""
+  echo "  NB: make sure to run this against the *oldest* supported version of postgres,"
+  echo "  or else pg_dump might output non-backwards-compatible syntax."
 }

 SCHEMA_NUMBER="9999"
@@ -240,25 +243,54 @@ DROP TABLE user_directory_search_stat;

 echo "Dumping SQLite3 schema..."

-mkdir -p "$OUTPUT_DIR/"{common,main,state}"/full_schema/$SCHEMA_NUMBER"
-sqlite3 "$SQLITE_COMMON_DB" ".schema --indent"           > "$OUTPUT_DIR/common/full_schema/$SCHEMA_NUMBER/full.sql.sqlite"
-sqlite3 "$SQLITE_COMMON_DB" ".dump --data-only --nosys" >> "$OUTPUT_DIR/common/full_schema/$SCHEMA_NUMBER/full.sql.sqlite"
-sqlite3 "$SQLITE_MAIN_DB"   ".schema --indent"           > "$OUTPUT_DIR/main/full_schema/$SCHEMA_NUMBER/full.sql.sqlite"
-sqlite3 "$SQLITE_MAIN_DB"   ".dump --data-only --nosys" >> "$OUTPUT_DIR/main/full_schema/$SCHEMA_NUMBER/full.sql.sqlite"
-sqlite3 "$SQLITE_STATE_DB"  ".schema --indent"           > "$OUTPUT_DIR/state/full_schema/$SCHEMA_NUMBER/full.sql.sqlite"
-sqlite3 "$SQLITE_STATE_DB"  ".dump --data-only --nosys" >> "$OUTPUT_DIR/state/full_schema/$SCHEMA_NUMBER/full.sql.sqlite"
+mkdir -p "$OUTPUT_DIR/"{common,main,state}"/full_schemas/$SCHEMA_NUMBER"
+sqlite3 "$SQLITE_COMMON_DB" ".schema"                    > "$OUTPUT_DIR/common/full_schemas/$SCHEMA_NUMBER/full.sql.sqlite"
+sqlite3 "$SQLITE_COMMON_DB" ".dump --data-only --nosys" >> "$OUTPUT_DIR/common/full_schemas/$SCHEMA_NUMBER/full.sql.sqlite"
+sqlite3 "$SQLITE_MAIN_DB"   ".schema"                    > "$OUTPUT_DIR/main/full_schemas/$SCHEMA_NUMBER/full.sql.sqlite"
+sqlite3 "$SQLITE_MAIN_DB"   ".dump --data-only --nosys" >> "$OUTPUT_DIR/main/full_schemas/$SCHEMA_NUMBER/full.sql.sqlite"
+sqlite3 "$SQLITE_STATE_DB"  ".schema"                    > "$OUTPUT_DIR/state/full_schemas/$SCHEMA_NUMBER/full.sql.sqlite"
+sqlite3 "$SQLITE_STATE_DB"  ".dump --data-only --nosys" >> "$OUTPUT_DIR/state/full_schemas/$SCHEMA_NUMBER/full.sql.sqlite"

 cleanup_pg_schema() {
-   sed -e '/^$/d' -e '/^--/d' -e 's/public\.//g' -e '/^SET /d' -e '/^SELECT /d'
+  # Cleanup as follows:
+  # - Remove empty lines. pg_dump likes to output a lot of these.
+  # - Remove comment-only lines. pg_dump also likes to output a lot of these to visually
+  #   separate tables etc.
+  # - Remove "public." prefix --- the schema name.
+  # - Remove "SET" commands. Last time I ran this, the output commands were
+  #     SET statement_timeout = 0;
+  #     SET lock_timeout = 0;
+  #     SET idle_in_transaction_session_timeout = 0;
+  #     SET client_encoding = 'UTF8';
+  #     SET standard_conforming_strings = on;
+  #     SET check_function_bodies = false;
+  #     SET xmloption = content;
+  #     SET client_min_messages = warning;
+  #     SET row_security = off;
+  #     SET default_table_access_method = heap;
+  # - Very carefully remove specific SELECT statements. We CANNOT blanket remove all
+  #   SELECT statements because some of those have side-effects which we do want in the
+  #   schema. Last time I ran this, the only SELECTS were
+  #     SELECT pg_catalog.set_config('search_path', '', false);
+  #   and
+  #     SELECT pg_catalog.setval(text, bigint, bool);
+  #   We do want to remove the former, but the latter is important. If the last argument
+  #   is `true` or omitted, this marks the given integer as having been consumed and
+  #   will NOT appear as the nextval.
+   sed -e '/^$/d' \
+   -e '/^--/d' \
+   -e 's/public\.//g' \
+   -e '/^SET /d' \
+   -e '/^SELECT pg_catalog.set_config/d'
 }

 echo "Dumping Postgres schema..."

-pg_dump --format=plain --schema-only         --no-tablespaces --no-acl --no-owner "$POSTGRES_COMMON_DB_NAME" | cleanup_pg_schema  > "$OUTPUT_DIR/common/full_schema/$SCHEMA_NUMBER/full.sql.postgres"
-pg_dump --format=plain --data-only --inserts --no-tablespaces --no-acl --no-owner "$POSTGRES_COMMON_DB_NAME" | cleanup_pg_schema >> "$OUTPUT_DIR/common/full_schema/$SCHEMA_NUMBER/full.sql.postgres"
-pg_dump --format=plain --schema-only         --no-tablespaces --no-acl --no-owner "$POSTGRES_MAIN_DB_NAME"   | cleanup_pg_schema  > "$OUTPUT_DIR/main/full_schema/$SCHEMA_NUMBER/full.sql.postgres"
-pg_dump --format=plain --data-only --inserts --no-tablespaces --no-acl --no-owner "$POSTGRES_MAIN_DB_NAME"   | cleanup_pg_schema >> "$OUTPUT_DIR/main/full_schema/$SCHEMA_NUMBER/full.sql.postgres"
-pg_dump --format=plain --schema-only         --no-tablespaces --no-acl --no-owner "$POSTGRES_STATE_DB_NAME"  | cleanup_pg_schema  > "$OUTPUT_DIR/state/full_schema/$SCHEMA_NUMBER/full.sql.postgres"
-pg_dump --format=plain --data-only --inserts --no-tablespaces --no-acl --no-owner "$POSTGRES_STATE_DB_NAME"  | cleanup_pg_schema >> "$OUTPUT_DIR/state/full_schema/$SCHEMA_NUMBER/full.sql.postgres"
+pg_dump --format=plain --schema-only         --no-tablespaces --no-acl --no-owner "$POSTGRES_COMMON_DB_NAME" | cleanup_pg_schema  > "$OUTPUT_DIR/common/full_schemas/$SCHEMA_NUMBER/full.sql.postgres"
+pg_dump --format=plain --data-only --inserts --no-tablespaces --no-acl --no-owner "$POSTGRES_COMMON_DB_NAME" | cleanup_pg_schema >> "$OUTPUT_DIR/common/full_schemas/$SCHEMA_NUMBER/full.sql.postgres"
+pg_dump --format=plain --schema-only         --no-tablespaces --no-acl --no-owner "$POSTGRES_MAIN_DB_NAME"   | cleanup_pg_schema  > "$OUTPUT_DIR/main/full_schemas/$SCHEMA_NUMBER/full.sql.postgres"
+pg_dump --format=plain --data-only --inserts --no-tablespaces --no-acl --no-owner "$POSTGRES_MAIN_DB_NAME"   | cleanup_pg_schema >> "$OUTPUT_DIR/main/full_schemas/$SCHEMA_NUMBER/full.sql.postgres"
+pg_dump --format=plain --schema-only         --no-tablespaces --no-acl --no-owner "$POSTGRES_STATE_DB_NAME"  | cleanup_pg_schema  > "$OUTPUT_DIR/state/full_schemas/$SCHEMA_NUMBER/full.sql.postgres"
+pg_dump --format=plain --data-only --inserts --no-tablespaces --no-acl --no-owner "$POSTGRES_STATE_DB_NAME"  | cleanup_pg_schema >> "$OUTPUT_DIR/state/full_schemas/$SCHEMA_NUMBER/full.sql.postgres"

 echo "Done! Files dumped to: $OUTPUT_DIR"
--- a/synapse/storage/database.py
+++ b/synapse/storage/database.py
@@ -393,6 +393,14 @@ class LoggingTransaction:
    def executemany(self, sql: str, *args: Any) -> None:
        self._do_execute(self.txn.executemany, sql, *args)

+    def executescript(self, sql: str) -> None:
+        if isinstance(self.database_engine, Sqlite3Engine):
+            self._do_execute(self.txn.executescript, sql)  # type: ignore[attr-defined]
+        else:
+            raise NotImplementedError(
+                f"executescript only exists for sqlite driver, not {type(self.database_engine)}"
+            )
+
    def _make_sql_one_line(self, sql: str) -> str:
        "Strip newlines out of SQL so that the loggers in the DB are on one line"
        return " ".join(line.strip() for line in sql.splitlines() if line.strip())

--- a/synapse/storage/engines/_base.py
+++ b/synapse/storage/engines/_base.py
@@ -32,9 +32,10 @@ class IncorrectDatabaseSetup(RuntimeError):


 ConnectionType = TypeVar("ConnectionType", bound=Connection)
+CursorType = TypeVar("CursorType", bound=Cursor)


-class BaseDatabaseEngine(Generic[ConnectionType], metaclass=abc.ABCMeta):
+class BaseDatabaseEngine(Generic[ConnectionType, CursorType], metaclass=abc.ABCMeta):
    def __init__(self, module: DBAPI2Module, config: Mapping[str, Any]):
        self.module = module

@@ -64,7 +65,7 @@ class BaseDatabaseEngine(Generic[ConnectionType], metaclass=abc.ABCMeta):
        ...

    @abc.abstractmethod
-    def check_new_database(self, txn: Cursor) -> None:
+    def check_new_database(self, txn: CursorType) -> None:
        """Gets called when setting up a brand new database. This allows us to
        apply stricter checks on new databases versus existing database.
        """
@@ -124,3 +125,21 @@ class BaseDatabaseEngine(Generic[ConnectionType], metaclass=abc.ABCMeta):
        Note: This has no effect on SQLite3, as transactions are SERIALIZABLE by default.
        """
        ...
+
+    @staticmethod
+    @abc.abstractmethod
+    def executescript(cursor: CursorType, script: str) -> None:
+        """Execute a chunk of SQL containing multiple semicolon-delimited statements.
+
+        This is not provided by DBAPI2, and so needs engine-specific support.
+        """
+        ...
+
+    @classmethod
+    def execute_script_file(cls, cursor: CursorType, filepath: str) -> None:
+        """Execute a file containing multiple semicolon-delimited SQL statements.
+
+        This is not provided by DBAPI2, and so needs engine-specific support.
+        """
+        with open(filepath, "rt") as f:
+            cls.executescript(cursor, f.read())
--- a/synapse/storage/engines/postgres.py
+++ b/synapse/storage/engines/postgres.py
@@ -31,7 +31,9 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)


-class PostgresEngine(BaseDatabaseEngine[psycopg2.extensions.connection]):
+class PostgresEngine(
+    BaseDatabaseEngine[psycopg2.extensions.connection, psycopg2.extensions.cursor]
+):
    def __init__(self, database_config: Mapping[str, Any]):
        super().__init__(psycopg2, database_config)
        psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
@@ -212,3 +214,11 @@ class PostgresEngine(BaseDatabaseEngine[psycopg2.extensions.connection]):
        else:
            isolation_level = self.isolation_level_map[isolation_level]
        return conn.set_isolation_level(isolation_level)
+
+    @staticmethod
+    def executescript(cursor: psycopg2.extensions.cursor, script: str) -> None:
+        """Execute a chunk of SQL containing multiple semicolon-delimited statements.
+
+        Psycopg2 seems happy to do this in DBAPI2's `execute()` function.
+        """
+        cursor.execute(script)
--- a/synapse/storage/engines/sqlite.py
+++ b/synapse/storage/engines/sqlite.py
@@ -24,7 +24,7 @@ if TYPE_CHECKING:
    from synapse.storage.database import LoggingDatabaseConnection


-class Sqlite3Engine(BaseDatabaseEngine[sqlite3.Connection]):
+class Sqlite3Engine(BaseDatabaseEngine[sqlite3.Connection, sqlite3.Cursor]):
    def __init__(self, database_config: Mapping[str, Any]):
        super().__init__(sqlite3, database_config)

@@ -120,6 +120,25 @@ class Sqlite3Engine(BaseDatabaseEngine[sqlite3.Connection]):
        # All transactions are SERIALIZABLE by default in sqlite
        pass

+    @staticmethod
+    def executescript(cursor: sqlite3.Cursor, script: str) -> None:
+        """Execute a chunk of SQL containing multiple semicolon-delimited statements.
+
+        Python's built-in SQLite driver does not allow you to do this with DBAPI2's
+        `execute`:
+
+        > execute() will only execute a single SQL statement. If you try to execute more
+        > than one statement with it, it will raise a Warning. Use executescript() if
+        > you want to execute multiple SQL statements with one call.
+
+        Though the docs for `executescript` warn:
+
+        > If there is a pending transaction, an implicit COMMIT statement is executed
+        > first. No other implicit transaction control is performed; any transaction
+        > control must be added to sql_script.
+        """
+        cursor.executescript(script)
+

 # Following functions taken from: https://github.com/coleifer/peewee


--- a/synapse/storage/prepare_database.py
+++ b/synapse/storage/prepare_database.py
@@ -266,7 +266,7 @@ def _setup_new_database(
            ".sql." + specific
        ):
            logger.debug("Applying schema %s", entry.absolute_path)
-            executescript(cur, entry.absolute_path)
+            database_engine.execute_script_file(cur, entry.absolute_path)

    cur.execute(
        "INSERT INTO schema_version (version, upgraded) VALUES (?,?)",
@@ -517,7 +517,7 @@ def _upgrade_existing_database(
                        UNAPPLIED_DELTA_ON_WORKER_ERROR % relative_path
                    )
                logger.info("Applying schema %s", relative_path)
-                executescript(cur, absolute_path)
+                database_engine.execute_script_file(cur, absolute_path)
            elif ext == specific_engine_extension and root_name.endswith(".sql"):
                # A .sql file specific to our engine; just read and execute it
                if is_worker:
@@ -525,7 +525,7 @@ def _upgrade_existing_database(
                        UNAPPLIED_DELTA_ON_WORKER_ERROR % relative_path
                    )
                logger.info("Applying engine-specific schema %s", relative_path)
-                executescript(cur, absolute_path)
+                database_engine.execute_script_file(cur, absolute_path)
            elif ext in specific_engine_extensions and root_name.endswith(".sql"):
                # A .sql file for a different engine; skip it.
                continue
@@ -666,7 +666,7 @@ def _get_or_create_schema_state(
 ) -> Optional[_SchemaState]:
    # Bluntly try creating the schema_version tables.
    sql_path = os.path.join(schema_path, "common", "schema_version.sql")
-    executescript(txn, sql_path)
+    database_engine.execute_script_file(txn, sql_path)

    txn.execute("SELECT version, upgraded FROM schema_version")
    row = txn.fetchone()

--- a/synapse/storage/schema/common/full_schemas/72/full.sql.postgres
+++ b/synapse/storage/schema/common/full_schemas/72/full.sql.postgres
+CREATE TABLE background_updates (
+    update_name text NOT NULL,
+    progress_json text NOT NULL,
+    depends_on text,
+    ordering integer DEFAULT 0 NOT NULL
+);
+ALTER TABLE ONLY background_updates
+    ADD CONSTRAINT background_updates_uniqueness UNIQUE (update_name);
--- a/synapse/storage/schema/common/full_schemas/72/full.sql.sqlite
+++ b/synapse/storage/schema/common/full_schemas/72/full.sql.sqlite
+CREATE TABLE background_updates (
+    update_name text NOT NULL,
+    progress_json text NOT NULL,
+    depends_on text, ordering INT NOT NULL DEFAULT 0,
+    CONSTRAINT background_updates_uniqueness UNIQUE (update_name)
+);
--- a/synapse/storage/schema/main/full_schemas/72/full.sql.postgres
+++ b/synapse/storage/schema/main/full_schemas/72/full.sql.postgres
--- a/synapse/storage/schema/main/full_schemas/72/full.sql.sqlite
+++ b/synapse/storage/schema/main/full_schemas/72/full.sql.sqlite
--- a/synapse/storage/schema/state/full_schemas/72/full.sql.postgres
+++ b/synapse/storage/schema/state/full_schemas/72/full.sql.postgres
+CREATE TABLE state_group_edges (
+    state_group bigint NOT NULL,
+    prev_state_group bigint NOT NULL
+);
+CREATE SEQUENCE state_group_id_seq
+    START WITH 1
+    INCREMENT BY 1
+    NO MINVALUE
+    NO MAXVALUE
+    CACHE 1;
+CREATE TABLE state_groups (
+    id bigint NOT NULL,
+    room_id text NOT NULL,
+    event_id text NOT NULL
+);
+CREATE TABLE state_groups_state (
+    state_group bigint NOT NULL,
+    room_id text NOT NULL,
+    type text NOT NULL,
+    state_key text NOT NULL,
+    event_id text NOT NULL
+);
+ALTER TABLE ONLY state_groups_state ALTER COLUMN state_group SET (n_distinct=-0.02);
+ALTER TABLE ONLY state_groups
+    ADD CONSTRAINT state_groups_pkey PRIMARY KEY (id);
+CREATE INDEX state_group_edges_prev_idx ON state_group_edges USING btree (prev_state_group);
+CREATE UNIQUE INDEX state_group_edges_unique_idx ON state_group_edges USING btree (state_group, prev_state_group);
+CREATE INDEX state_groups_room_id_idx ON state_groups USING btree (room_id);
+CREATE INDEX state_groups_state_type_idx ON state_groups_state USING btree (state_group, type, state_key);
+SELECT pg_catalog.setval('state_group_id_seq', 1, false);
--- a/synapse/storage/schema/state/full_schemas/72/full.sql.sqlite
+++ b/synapse/storage/schema/state/full_schemas/72/full.sql.sqlite
+CREATE TABLE state_groups (
+    id BIGINT PRIMARY KEY,
+    room_id TEXT NOT NULL,
+    event_id TEXT NOT NULL
+);
+CREATE TABLE state_groups_state (
+    state_group BIGINT NOT NULL,
+    room_id TEXT NOT NULL,
+    type TEXT NOT NULL,
+    state_key TEXT NOT NULL,
+    event_id TEXT NOT NULL
+);
+CREATE TABLE state_group_edges (
+    state_group BIGINT NOT NULL,
+    prev_state_group BIGINT NOT NULL
+);
+CREATE INDEX state_group_edges_prev_idx ON state_group_edges (prev_state_group);
+CREATE INDEX state_groups_state_type_idx ON state_groups_state (state_group, type, state_key);
+CREATE INDEX state_groups_room_id_idx ON state_groups (room_id) ;
+CREATE UNIQUE INDEX state_group_edges_unique_idx ON state_group_edges (state_group, prev_state_group) ;