feat: postgres prep on start

Alex
2026-04-17 20:45:22 +01:00
parent 66541e934b
commit e103799f81
11 changed files with 501 additions and 141 deletions

View File

@@ -1,3 +1,4 @@
import logging
import os
import platform
import uuid
@@ -20,6 +21,7 @@ from application.api.connector.routes import connector # noqa: E402
from application.api.v1 import v1_bp # noqa: E402
from application.celery_init import celery # noqa: E402
from application.core.settings import settings # noqa: E402
from application.storage.db.bootstrap import ensure_database_ready # noqa: E402
from application.stt.upload_limits import ( # noqa: E402
build_stt_file_size_limit_message,
should_reject_stt_request,
@@ -32,6 +34,17 @@ if platform.system() == "Windows":
pathlib.PosixPath = pathlib.WindowsPath
dotenv.load_dotenv()
# Self-bootstrap the user-data Postgres DB. Runs before any blueprint or
# repository touches the engine, so the first request can't race the
# schema being created. Gated by AUTO_CREATE_DB / AUTO_MIGRATE settings
# (default ON for dev; disable in prod if schema is managed out-of-band).
ensure_database_ready(
settings.POSTGRES_URI,
create_db=settings.AUTO_CREATE_DB,
migrate=settings.AUTO_MIGRATE,
logger=logging.getLogger("application.app"),
)
app = Flask(__name__)
app.register_blueprint(user)
app.register_blueprint(answer)

View File

@@ -30,6 +30,10 @@ class Settings(BaseSettings):
MONGO_URI: Optional[str] = None
# User-data Postgres DB.
POSTGRES_URI: Optional[str] = None
# On app startup, apply pending Alembic migrations. Default ON for dev; disable in prod if you manage schema out-of-band.
AUTO_MIGRATE: bool = True
# On app startup, create the target Postgres database if it's missing (requires CREATEDB privilege). Dev-friendly default.
AUTO_CREATE_DB: bool = True
LLM_PATH: str = os.path.join(current_dir, "models/docsgpt-7b-f16.gguf")
DEFAULT_MAX_HISTORY: int = 150
DEFAULT_LLM_TOKEN_LIMIT: int = 128000 # Fallback when model not found in registry

View File

@@ -0,0 +1,320 @@
"""Self-bootstrapping database setup for the DocsGPT user-data Postgres DB.
On app startup the Flask factory (and Celery worker init) can call
:func:`ensure_database_ready` to:
1. Create the target database if it's missing (dev-friendly; requires the
configured role to have ``CREATEDB`` privilege).
2. Apply every pending Alembic migration up to ``head``.
Both steps are gated by settings that default ON for dev convenience and
can be turned off in prod (``AUTO_CREATE_DB`` / ``AUTO_MIGRATE``) where
schema is managed out-of-band by a deploy pipeline.
All heavy imports (alembic, psycopg, sqlalchemy.exc sub-symbols) are
deferred into the function bodies, so merely importing this module has
no side effects and stays cheap for test collection.
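Typical call, made once at process start (mirrors ``application/app.py``;
``settings`` is ``application.core.settings.settings``)::

    ensure_database_ready(
        settings.POSTGRES_URI,
        create_db=settings.AUTO_CREATE_DB,
        migrate=settings.AUTO_MIGRATE,
    )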
"""
from __future__ import annotations
import logging
from typing import Optional
def ensure_database_ready(
uri: Optional[str],
*,
create_db: bool,
migrate: bool,
logger: Optional[logging.Logger] = None,
) -> None:
"""Make sure the target Postgres DB exists and is migrated to ``head``.
This is idempotent, so calling it on every process start is safe. Each step is
independently gated so prod deployments that manage schema externally
can disable the migrate step while still allowing the process to boot
against an already-provisioned database.
Args:
uri: SQLAlchemy URI for the user-data Postgres database. If
``None`` or empty, the function logs and returns — the app
supports running without a configured URI for certain dev
flows that don't touch user data.
create_db: If ``True``, auto-create the database when it's
missing. Requires the configured role to have ``CREATEDB``.
migrate: If ``True``, run ``alembic upgrade head`` after the
database is reachable.
logger: Optional logger to use. Defaults to this module's logger.
Raises:
Exception: Any failure in an explicitly-enabled step is re-raised
so the app fails fast rather than booting into a broken state.
Missing-role / auth errors surface cleanly without a
mis-directed auto-create attempt.
"""
log = logger or logging.getLogger(__name__)
if not uri:
log.info(
"ensure_database_ready: POSTGRES_URI is not set; "
"skipping database bootstrap."
)
return
if create_db:
_ensure_database_exists(uri, log)
if migrate:
_run_migrations(log)
def _ensure_database_exists(uri: str, log: logging.Logger) -> None:
"""Create the target database if a connection reveals it's missing.
We probe with a lightweight ``connect().close()``. If Postgres
reports ``InvalidCatalogName`` (SQLSTATE ``3D000``), we reconnect to
the server's ``postgres`` maintenance DB and issue ``CREATE DATABASE``
in AUTOCOMMIT mode (required — CREATE DATABASE can't run in a
transaction). Any other connection failure (bad host, auth failure,
missing role) is re-raised untouched so the operator sees the true
cause instead of a mis-directed auto-create attempt.
"""
# Lazy imports keep module import side-effect free.
from sqlalchemy import create_engine
from sqlalchemy.engine import make_url
from sqlalchemy.exc import OperationalError
url = make_url(uri)
target_db = url.database
if not target_db:
raise RuntimeError(
f"POSTGRES_URI is missing a database name: {uri!r}. "
"Expected something like "
"'postgresql+psycopg://user:pass@host:5432/docsgpt'."
)
probe_engine = create_engine(uri, pool_pre_ping=False)
try:
try:
conn = probe_engine.connect()
except OperationalError as exc:
if _is_missing_database(exc):
log.info(
"ensure_database_ready: database %r is missing; "
"creating it...",
target_db,
)
_create_database(url, target_db, log)
log.info("ensure_database_ready: database %r ready.", target_db)
return
# Not a missing-DB error — surface it as-is. This is the path
# for bad host/auth/role-missing, and auto-creating would be
# actively wrong there.
log.error(
"ensure_database_ready: cannot connect to Postgres for "
"database %r: %s",
target_db,
exc,
)
raise
else:
conn.close()
log.info("ensure_database_ready: database %r ready.", target_db)
finally:
probe_engine.dispose()
def _create_database(url, target_db: str, log: logging.Logger) -> None:
"""Issue ``CREATE DATABASE`` against the server's ``postgres`` DB.
Uses AUTOCOMMIT (required by Postgres — ``CREATE DATABASE`` cannot run
inside a transaction). The database identifier is quoted via
``psycopg.sql.Identifier`` so unusual names (hyphens, reserved words)
are handled correctly.
Args:
url: Parsed SQLAlchemy URL for the target DB; we reuse
host/port/credentials and swap the database to ``postgres``.
target_db: The target database name to create.
log: Logger for INFO/ERROR breadcrumbs.
"""
from sqlalchemy import create_engine
from sqlalchemy.exc import OperationalError, ProgrammingError
# psycopg is imported lazily; its error classes are the canonical
# markers for the SQLSTATEs Postgres hands back.
import psycopg
from psycopg import sql as pg_sql
maintenance_url = url.set(database="postgres")
maintenance_engine = create_engine(
maintenance_url,
isolation_level="AUTOCOMMIT",
pool_pre_ping=False,
)
try:
with maintenance_engine.connect() as conn:
# Use psycopg's Identifier to quote the DB name safely. The
# Composed SQL renders as ``CREATE DATABASE "<name>"`` and is
# executed on the raw psycopg cursor below, since SQLAlchemy's
# Connection.execute() does not accept psycopg sql objects.
stmt = pg_sql.SQL("CREATE DATABASE {}").format(
pg_sql.Identifier(target_db)
)
raw = conn.connection.dbapi_connection # psycopg connection
with raw.cursor() as cur:
try:
cur.execute(stmt)
except psycopg.errors.DuplicateDatabase:
# Another worker won the race — benign.
log.info(
"ensure_database_ready: database %r already "
"created by a concurrent worker; continuing.",
target_db,
)
except psycopg.errors.InsufficientPrivilege as exc:
log.error(
"ensure_database_ready: role lacks CREATEDB "
"privilege to create %r. Either GRANT CREATEDB "
"to the role, create the database manually, or "
"set AUTO_CREATE_DB=False and provision it "
"out-of-band. See docs/Deploying/Postgres-"
"Migration for guidance. Underlying error: %s",
target_db,
exc,
)
raise
except (OperationalError, ProgrammingError) as exc:
log.error(
"ensure_database_ready: failed to create database %r: %s. "
"See docs/Deploying/Postgres-Migration for manual setup.",
target_db,
exc,
)
raise
finally:
maintenance_engine.dispose()
def _is_missing_database(exc: Exception) -> bool:
"""Return True if ``exc`` indicates the target database doesn't exist.
We check three signals in the cause chain:
1. ``psycopg.errors.InvalidCatalogName`` — the canonical class for
SQLSTATE ``3D000`` when raised during a query.
2. ``pgcode`` / ``diag.sqlstate`` equal to ``3D000`` — defensive, for
driver versions that surface the code on a generic class.
3. The canonical server message phrasing ``database "..." does not
exist`` — **required** for connection-time failures, because
psycopg 3's ``OperationalError`` raised by ``connect()`` does NOT
populate ``sqlstate`` (the connection never completed the protocol
handshake, so the attributes stay ``None``). The server's error
message itself is stable across Postgres versions, so this is a
reliable fallback for the only case that matters: DB missing at
boot.
"""
try:
import psycopg
invalid_catalog = psycopg.errors.InvalidCatalogName
except Exception: # noqa: BLE001 — defensive; never break on import
invalid_catalog = None
seen: set[int] = set()
cursor: Optional[BaseException] = exc
while cursor is not None and id(cursor) not in seen:
seen.add(id(cursor))
if invalid_catalog is not None and isinstance(cursor, invalid_catalog):
return True
pgcode = getattr(cursor, "pgcode", None) or getattr(
getattr(cursor, "diag", None), "sqlstate", None
)
if pgcode == "3D000":
return True
msg = str(cursor)
if 'database "' in msg and "does not exist" in msg:
return True
cursor = cursor.__cause__ or cursor.__context__
return False
def _run_migrations(log: logging.Logger) -> None:
"""Run ``alembic upgrade head`` against ``POSTGRES_URI``.
Alembic serializes concurrent workers via its ``alembic_version``
table, so no extra application-level locking is needed. Failures are
logged and re-raised so the app fails fast.
"""
from pathlib import Path
# Lazy imports — alembic pulls in a fair amount of code.
from alembic import command
from alembic.config import Config
from alembic.runtime.migration import MigrationContext
from alembic.script import ScriptDirectory
from sqlalchemy import create_engine
# Mirror the discovery path used by scripts/db/init_postgres.py so
# both entry points resolve the same alembic.ini regardless of cwd.
alembic_ini = Path(__file__).resolve().parents[2] / "alembic.ini"
if not alembic_ini.exists():
raise RuntimeError(f"alembic.ini not found at {alembic_ini}")
cfg = Config(str(alembic_ini))
cfg.set_main_option("script_location", str(alembic_ini.parent / "alembic"))
# Cheap pre-check: if we're already at head, say so explicitly.
try:
script = ScriptDirectory.from_config(cfg)
head_rev = script.get_current_head()
url = cfg.get_main_option("sqlalchemy.url")
# env.py populates sqlalchemy.url from settings.POSTGRES_URI when
# it's imported, but our Config instance hasn't loaded env.py
# yet. Fall back to reading settings directly for the precheck.
if not url:
from application.core.settings import settings as _settings
url = _settings.POSTGRES_URI
current_rev: Optional[str] = None
if url:
precheck_engine = create_engine(url, pool_pre_ping=False)
try:
with precheck_engine.connect() as conn:
ctx = MigrationContext.configure(conn)
current_rev = ctx.get_current_revision()
finally:
precheck_engine.dispose()
if current_rev is not None and current_rev == head_rev:
log.info(
"ensure_database_ready: migrations already at head (%s); "
"nothing to do.",
head_rev,
)
return
log.info(
"ensure_database_ready: applying Alembic migrations "
"(current=%s, target=%s)...",
current_rev,
head_rev,
)
except Exception as exc: # noqa: BLE001 — precheck is best-effort
# If the precheck itself fails we still want to try the upgrade;
# alembic will give a more actionable error if something's off.
log.info(
"ensure_database_ready: revision precheck failed (%s); "
"proceeding with upgrade anyway.",
exc,
)
try:
command.upgrade(cfg, "head")
except Exception as exc: # noqa: BLE001 — surface everything
log.error(
"ensure_database_ready: alembic upgrade failed: %s. "
"Check migration logs and DB connectivity; the app will not "
"boot until this is resolved (or AUTO_MIGRATE is disabled).",
exc,
)
raise
log.info("ensure_database_ready: migrations applied.")

View File

@@ -27,8 +27,8 @@ services:
depends_on:
redis:
condition: service_started
postgres-init:
condition: service_completed_successfully
postgres:
condition: service_healthy
worker:
build: ../application
@@ -44,8 +44,8 @@ services:
depends_on:
redis:
condition: service_started
postgres-init:
condition: service_completed_successfully
postgres:
condition: service_healthy
redis:
image: redis:6-alpine
@@ -68,17 +68,5 @@ services:
timeout: 5s
retries: 10
postgres-init:
build: ../application
command: python scripts/db/init_postgres.py
env_file:
- ../.env
environment:
- POSTGRES_URI=postgresql://docsgpt:docsgpt@postgres:5432/docsgpt
depends_on:
postgres:
condition: service_healthy
restart: "no"
volumes:
postgres_data:

View File

@@ -32,8 +32,8 @@ services:
depends_on:
redis:
condition: service_started
postgres-init:
condition: service_completed_successfully
postgres:
condition: service_healthy
worker:
@@ -55,8 +55,8 @@ services:
depends_on:
redis:
condition: service_started
postgres-init:
condition: service_completed_successfully
postgres:
condition: service_healthy
redis:
image: redis:6-alpine
@@ -79,17 +79,5 @@ services:
timeout: 5s
retries: 10
postgres-init:
image: arc53/docsgpt:develop
command: python scripts/db/init_postgres.py
env_file:
- ../.env
environment:
- POSTGRES_URI=postgresql://docsgpt:docsgpt@postgres:5432/docsgpt
depends_on:
postgres:
condition: service_healthy
restart: "no"
volumes:
postgres_data:

View File

@@ -33,8 +33,8 @@ services:
depends_on:
redis:
condition: service_started
postgres-init:
condition: service_completed_successfully
postgres:
condition: service_healthy
worker:
user: root
@@ -56,8 +56,8 @@ services:
depends_on:
redis:
condition: service_started
postgres-init:
condition: service_completed_successfully
postgres:
condition: service_healthy
redis:
image: redis:6-alpine
@@ -80,21 +80,6 @@ services:
timeout: 5s
retries: 10
# One-shot migrator: runs alembic upgrade head, then exits. The backend
# and worker services wait for it via `service_completed_successfully`,
# so they never see a partially-migrated schema.
postgres-init:
build: ../application
command: python scripts/db/init_postgres.py
env_file:
- ../.env
environment:
- POSTGRES_URI=postgresql://docsgpt:docsgpt@postgres:5432/docsgpt
depends_on:
postgres:
condition: service_healthy
restart: "no"
volumes:
postgres_data:

View File

@@ -50,6 +50,13 @@ spec:
secretKeyRef:
name: docsgpt-secrets
key: POSTGRES_URI
# Disable in-app auto-bootstrap. The `postgres-init` Job under
# deployment/k8s/jobs/ owns schema creation and Alembic migrations,
# so application pods must not race with it on rollout.
- name: AUTO_MIGRATE
value: "false"
- name: AUTO_CREATE_DB
value: "false"
---
apiVersion: apps/v1
kind: Deployment
@@ -97,6 +104,13 @@ spec:
secretKeyRef:
name: docsgpt-secrets
key: POSTGRES_URI
# Disable in-app auto-bootstrap. The `postgres-init` Job under
# deployment/k8s/jobs/ owns schema creation and Alembic migrations,
# so application pods must not race with it on rollout.
- name: AUTO_MIGRATE
value: "false"
- name: AUTO_CREATE_DB
value: "false"
---
apiVersion: apps/v1
kind: Deployment

View File

@@ -11,7 +11,7 @@ This guide will walk you through setting up a development environment for DocsGP
## 1. Spin Up Postgres and Redis
For development purposes, you can quickly start Postgres and Redis containers. Postgres is the user-data store for DocsGPT (conversations, agents, prompts, sources, attachments, workflows, logs, and token usage), and Redis is used as the cache and Celery broker. We provide a dedicated Docker Compose file, `docker-compose-dev.yaml`, located in the `deployment` directory, that includes only these essential services along with a one-shot `postgres-init` migrator that applies the Alembic schema.
For development purposes, you can quickly start Postgres and Redis containers. Postgres is the user-data store for DocsGPT (conversations, agents, prompts, sources, attachments, workflows, logs, and token usage), and Redis is used as the cache and Celery broker. We provide a dedicated Docker Compose file, `docker-compose-dev.yaml`, located in the `deployment` directory, that includes only these essential services. The backend applies the Alembic schema automatically on first boot (`AUTO_MIGRATE=true` / `AUTO_CREATE_DB=true` ship enabled), so no separate migration step is required. You can still run `python scripts/db/init_postgres.py` explicitly if you prefer.
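For reference, the bootstrap is a single call made at import time in `application/app.py` (a sketch of that call; the optional `logger` argument is omitted here):
```python
# What the backend runs on boot. Both steps are idempotent no-ops once
# the database exists and the schema is at the Alembic head.
from application.core.settings import settings
from application.storage.db.bootstrap import ensure_database_ready

ensure_database_ready(
    settings.POSTGRES_URI,
    create_db=settings.AUTO_CREATE_DB,  # AUTO_CREATE_DB, default true
    migrate=settings.AUTO_MIGRATE,      # AUTO_MIGRATE, default true
)
```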
You can find the `docker-compose-dev.yaml` file [here](https://github.com/arc53/DocsGPT/blob/main/deployment/docker-compose-dev.yaml).
@@ -26,7 +26,7 @@ You can find the `docker-compose-dev.yaml` file [here](https://github.com/arc53/
docker compose -f deployment/docker-compose-dev.yaml up -d
```
These commands will start Postgres and Redis in detached mode, running in the background. The `postgres-init` service runs once against the fresh database and then exits.
These commands will start Postgres and Redis in detached mode, running in the background. When the Flask backend boots against the fresh Postgres instance, it will automatically create the database (if missing) and apply the current Alembic schema.
<Callout type="info" emoji="">
MongoDB is no longer required for a default DocsGPT install. If you

View File

@@ -248,6 +248,8 @@ DocsGPT stores user data — conversations, agents, prompts, sources, attachment
| Setting | Description | Default |
| --- | --- | --- |
| `POSTGRES_URI` | SQLAlchemy-compatible Postgres URI. Any standard `postgresql://` form works — DocsGPT normalizes it internally to the `psycopg` v3 dialect. | — |
| `AUTO_CREATE_DB` | On startup, connect to the server's `postgres` maintenance DB and issue `CREATE DATABASE` if the target is missing. Requires `CREATEDB` or superuser. No-op when the database already exists. Disable in production. | `true` |
| `AUTO_MIGRATE` | On startup, run `alembic upgrade head` against the target database. Idempotent and serialized across workers via `alembic_version`. Disable in production in favor of an explicit migration step. | `true` |
Example:
@@ -256,13 +258,19 @@ POSTGRES_URI=postgresql://docsgpt:docsgpt@localhost:5432/docsgpt
# Append ?sslmode=require for managed providers that enforce SSL.
```
Apply the schema once (idempotent):
With the defaults, the app applies the schema automatically on first
boot. To run it explicitly instead (e.g., in CI/CD or a k8s `Job`):
```bash
python scripts/db/init_postgres.py
```
The default Docker Compose file bundles a `postgres` service plus a one-shot `postgres-init` migrator, so you don't have to run this by hand for containerized deployments.
The default Docker Compose file bundles a `postgres` service, and the
app auto-bootstraps the database on boot, so containerized deployments
need no manual migration step. See
[PostgreSQL for User Data](/Deploying/Postgres-Migration#production-hardening)
for the recommended production flow (both flags `false`, migrations
gated by CI/CD).
<Callout type="info" emoji="">
`MONGO_URI` is **opt-in**. It is only consulted when you select the

View File

@@ -1,126 +1,151 @@
---
title: PostgreSQL for User Data
description: PostgreSQL is the user-data store for DocsGPT. This page covers fresh installs and the one-shot migration from legacy MongoDB deployments.
description: PostgreSQL is the user-data store for DocsGPT. Covers auto-bootstrap, production hardening, and the one-shot migration from legacy MongoDB deployments.
---
import { Callout } from 'nextra/components'
# PostgreSQL for User Data
DocsGPT uses **PostgreSQL** as the user-data store for conversations,
agents, prompts, sources, attachments, workflows, logs, token usage,
and the rest of the application's structured state. MongoDB is no
longer required for a default install.
DocsGPT stores conversations, agents, prompts, sources, attachments,
workflows, logs, and token usage in **PostgreSQL**. MongoDB is no longer
required.
<Callout type="info" emoji="">
Vector stores are independent from user-data storage. `VECTOR_STORE`
can still be `pgvector`, `faiss`, `qdrant`, `milvus`, `elasticsearch`,
or `mongodb` (Mongo Atlas Vector Search) — your choice there does not
affect this page.
Vector stores are independent — `VECTOR_STORE` can still be `pgvector`,
`faiss`, `qdrant`, `milvus`, `elasticsearch`, or `mongodb`.
</Callout>
## Fresh install
## Quickstart
1. **Run Postgres 13+.** Native install, Docker, or managed (Neon, RDS,
Supabase, Cloud SQL…) — all work. The default Docker Compose file
ships a `postgres` service plus a one-shot `postgres-init` migrator
that applies the schema automatically.
Three common paths. Each assumes Postgres 13+ and the default env vars
`AUTO_MIGRATE=true` / `AUTO_CREATE_DB=true` (both ship enabled).
2. **Create a database and role** (skip if your managed provider gave
you these, or if you're using the bundled compose `postgres`
service):
### Docker Compose
```sql
CREATE ROLE docsgpt LOGIN PASSWORD 'docsgpt';
CREATE DATABASE docsgpt OWNER docsgpt;
```
The bundled compose file ships a `postgres` service. App boot handles the
rest — no sidecar, no init job.
3. **Set `POSTGRES_URI` in `.env`.** Any standard Postgres URI works —
DocsGPT normalizes it internally to the SQLAlchemy `psycopg` (v3)
dialect.
```bash
cd deployment && docker compose up
```
```bash
POSTGRES_URI=postgresql://docsgpt:docsgpt@localhost:5432/docsgpt
# Append ?sslmode=require for managed providers that enforce SSL.
```
### Managed Postgres (Neon, RDS, Supabase, Cloud SQL)
4. **Apply the schema** (idempotent — safe to re-run). The bundled
`postgres-init` compose service does this for you; if you're running
the backend outside compose, run it manually:
Point `POSTGRES_URI` at the provider-given URI. The app applies the
schema on first boot.
```bash
python scripts/db/init_postgres.py
# or equivalently:
alembic -c application/alembic.ini upgrade head
```
```bash
export POSTGRES_URI="postgresql://user:pass@host/docsgpt?sslmode=require"
flask --app application/app.py run --host=0.0.0.0 --port=7091
```
That's it — the backend will come up against Postgres.
### Bare-metal Postgres
## Migrating from a legacy MongoDB install
Run Postgres locally and point `POSTGRES_URI` at the default superuser.
First boot creates both the database and the schema.
If you are upgrading from an older DocsGPT deployment that stored user
data in MongoDB, a one-shot migration tool copies every collection into
Postgres. The tool is run **once**, offline, with the app stopped.
```bash
export POSTGRES_URI="postgresql://postgres@localhost/docsgpt"
flask --app application/app.py run --host=0.0.0.0 --port=7091
```
1. **Install the optional Mongo client libraries.** `pymongo` and
`dnspython` are no longer part of the default backend install;
install them directly alongside the base requirements:
Prefer a dedicated non-superuser role? Create it once as superuser — the
app never creates roles.
```bash
pip install -r application/requirements.txt
pip install 'pymongo>=4.6'
```
```sql
CREATE ROLE docsgpt LOGIN PASSWORD 'docsgpt' CREATEDB;
-- Then: POSTGRES_URI=postgresql://docsgpt:docsgpt@localhost/docsgpt
```
2. **Provision Postgres** following the [Fresh install](#fresh-install)
steps above, so `POSTGRES_URI` is set and the schema is applied.
## How auto-bootstrap works
3. **Point the backfill at both databases.** Set `MONGO_URI` in the
environment alongside `POSTGRES_URI` for the duration of the
migration:
Two env vars control startup behavior. Both default to `true` in the
app, and both steps are idempotent.
```bash
export MONGO_URI="mongodb://user:pass@host:27017/docsgpt"
export POSTGRES_URI="postgresql://docsgpt:docsgpt@localhost:5432/docsgpt"
```
| Setting | Effect | Requires |
| --- | --- | --- |
| `AUTO_CREATE_DB` | If the target database is missing, connects to the server's `postgres` maintenance DB and issues `CREATE DATABASE`. | `CREATEDB` privilege (or superuser) |
| `AUTO_MIGRATE` | Runs `alembic upgrade head` against the target database. | Table-owner or superuser on the target DB |
4. **Run the backfill.** Idempotent — re-run any time to re-sync
drifted rows. Without arguments, backfills every registered table;
pass `--tables` to limit.
Concurrent workers serialize through `alembic_version`, so rolling
restarts are safe. If the role lacks the required privilege, startup
fails fast with a clear error rather than silently skipping.
```bash
python scripts/db/backfill.py --dry-run # preview everything
python scripts/db/backfill.py # real run, everything
python scripts/db/backfill.py --tables users # only specific tables
```
<Callout type="info" emoji="">
Convenient in dev. In production, disable both and run migrations as
an explicit step — see [Production hardening](#production-hardening).
</Callout>
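Condensed, the missing-database check that gates `AUTO_CREATE_DB` looks roughly like this (an illustrative sketch assuming the `psycopg` v3 driver; the real implementation in `application/storage/db/bootstrap.py` also walks the exception cause chain and checks SQLSTATE `3D000`):
```python
from sqlalchemy import create_engine
from sqlalchemy.exc import OperationalError

def database_is_missing(uri: str) -> bool:
    engine = create_engine(uri)
    try:
        engine.connect().close()
        return False  # reachable; nothing to create
    except OperationalError as exc:
        # psycopg 3 does not populate SQLSTATE on connect() failures,
        # so match the server's stable message text. Other failures
        # (bad host, auth, missing role) are re-raised by the real code.
        msg = str(exc)
        return 'database "' in msg and "does not exist" in msg
    finally:
        engine.dispose()
```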
5. **Restart the app against Postgres only.** Unset `MONGO_URI` (or
leave it unset — it is `Optional[str] = None` in settings) and start
the backend. Nothing in the default code path consults MongoDB
anymore.
## Production hardening
Set both flags to `false` in prod and run migrations as a gated,
auditable step before rolling out the app.
```env
AUTO_MIGRATE=false
AUTO_CREATE_DB=false
```
Run migrations from your CI/CD pipeline, a Kubernetes `Job`, or an
init-container ahead of the app rollout:
```bash
python scripts/db/init_postgres.py
# equivalently:
alembic -c application/alembic.ini upgrade head
```
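If your rollout tooling is Python, the same step can be issued programmatically (a sketch; assumes you run from the repository root so the `.ini` path resolves):
```python
# Programmatic equivalent of `alembic -c application/alembic.ini upgrade head`.
from alembic import command
from alembic.config import Config

cfg = Config("application/alembic.ini")
cfg.set_main_option("script_location", "application/alembic")
command.upgrade(cfg, "head")  # concurrent runs serialize via alembic_version
```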
The reasoning: the app's runtime role shouldn't carry DDL privileges,
migrations should gate each rollout, and an explicit step is
auditable — implicit first-boot bootstrap is fine for dev but muddies
prod deploys.
<Callout type="warning" emoji="⚠️">
The backfill is a one-shot tool. There is no dual-write window and no
runtime feature flag — once you're on the current version, Postgres
is the only user-data store the backend reads from or writes to.
Migrations are not reversible by the app. Always back up production
Postgres before running `alembic upgrade head` on a new release.
</Callout>
<Callout type="info" emoji="">
Keep your MongoDB instance online until you have verified the
Postgres data is complete. You can re-run `backfill.py` at any time
to re-sync. Once you're satisfied, decommission MongoDB — unless you
also use it as your vector store (`VECTOR_STORE=mongodb`), in which
case keep it for that purpose.
## Migrating from MongoDB
One-shot, offline, app stopped. The app itself will create the
Postgres schema when it boots — you only need to run the data copy.
```bash
pip install -r application/requirements.txt
pip install 'pymongo>=4.6'
export POSTGRES_URI="postgresql://docsgpt:docsgpt@localhost:5432/docsgpt"
export MONGO_URI="mongodb://user:pass@host:27017/docsgpt"
python scripts/db/backfill.py --dry-run # preview
python scripts/db/backfill.py # real run
# or: python scripts/db/backfill.py --tables users,agents
```
Then unset `MONGO_URI` and start the backend — nothing consults Mongo
in the default path anymore. The backfill is idempotent (per-table
`ON CONFLICT` upserts, event-log tables deduped via `mongo_id`), so
re-running is safe and re-syncs any drifted rows. Keep Mongo online
until you've verified Postgres is complete; decommission afterwards
unless you still use it as a vector store.
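The upsert pattern behind that idempotency looks like this (an illustrative sketch only; the table and key column here are hypothetical, not the backfill's actual code):
```python
from sqlalchemy.dialects.postgresql import insert

def upsert_rows(conn, table, rows):
    # INSERT ... ON CONFLICT DO UPDATE keyed on the primary key makes
    # re-runs converge on the source data instead of duplicating rows.
    stmt = insert(table).values(rows)
    stmt = stmt.on_conflict_do_update(
        index_elements=[table.c.id],  # hypothetical key column
        set_={
            c.name: stmt.excluded[c.name]
            for c in table.columns
            if c.name != "id"
        },
    )
    conn.execute(stmt)
```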
<Callout type="warning" emoji="⚠️">
No dual-write window and no runtime flag — on the current version,
Postgres is the only user-data store the backend reads or writes.
</Callout>
## Troubleshooting
- **`relation "..." does not exist`** — run `python scripts/db/init_postgres.py`
(or `alembic -c application/alembic.ini upgrade head`).
- **`FATAL: role "docsgpt" does not exist`** — run the `CREATE ROLE` /
`CREATE DATABASE` statements from step 2 of the fresh install as a
Postgres superuser.
- **`relation "..." does not exist`** — schema not applied. Either let
the app bootstrap it (`AUTO_MIGRATE=true`) or run
`python scripts/db/init_postgres.py`.
- **`permission denied to create database`** — the role lacks
`CREATEDB`. As superuser: `ALTER ROLE <name> CREATEDB;`. Or create
the database manually and set `AUTO_CREATE_DB=false`.
- **`role "docsgpt" does not exist`** — roles are never auto-created.
As superuser: `CREATE ROLE docsgpt LOGIN PASSWORD '...';`.
- **SSL errors on a managed provider** — append `?sslmode=require` to
`POSTGRES_URI`.
- **`ModuleNotFoundError: pymongo` when running `backfill.py`** —
install the Mongo client directly:
`pip install 'pymongo>=4.6'`.
- **`ModuleNotFoundError: pymongo`** — `pip install 'pymongo>=4.6'`
(only needed for the one-shot Mongo backfill).

View File

@@ -22,6 +22,21 @@ corresponding route handler is migrated to a repository read.
from __future__ import annotations
import os
# Disable the app's self-bootstrap (AUTO_CREATE_DB / AUTO_MIGRATE) before
# any ``application.*`` module is imported. ``application/app.py`` runs
# ``ensure_database_ready`` at import time using whatever ``POSTGRES_URI``
# is set in the environment — which in dev is the operator's local DB, not
# the ephemeral ``pytest-postgresql`` cluster that the fixtures below spin
# up. Tests manage their own schema via the ``pg_engine`` fixture
# (subprocess ``alembic upgrade head`` against the per-test URI), so the
# import-time bootstrap would at best be redundant and at worst would
# mutate the operator's dev DB. ``setdefault`` so a test run can still
# opt back in by setting the env var explicitly.
os.environ.setdefault("AUTO_MIGRATE", "false")
os.environ.setdefault("AUTO_CREATE_DB", "false")
import subprocess
import sys
from pathlib import Path