Skip to content

People

transcript_indexer.people

Speaker disambiguation: people, person_aliases, and the resolver.

turns.speaker, participants.speaker, entities.value (when type='person'), and action_items.owner always store the raw source string. A nullable person_id FK on each row is populated by the deterministic post-pass below via case-insensitive lookup in person_aliases. NULL means "unresolved" — the row is still queryable by raw string.

Alias curation is semi-manual via the txi people subcommand surface in cli.py. Slack readers (Phase 8) auto-populate aliases from S_USER.

add_person(conn, canonical, aliases)

Insert a person and (canonical + given aliases) into person_aliases.

The canonical name is itself added as an alias so a raw canonical string resolves without an explicit alias entry.

Source code in src/transcript_indexer/people.py
def add_person(conn: sqlite3.Connection, canonical: str, aliases: list[str]) -> int:
    """Insert a person and (canonical + given aliases) into person_aliases.

    The canonical name is itself added as an alias so a raw canonical string
    resolves without an explicit alias entry.
    """
    with conn:
        cur = conn.execute("INSERT INTO people(canonical) VALUES(?)", (canonical,))
        person_id = int(cur.lastrowid or 0)
        all_aliases = {canonical, *aliases}
        conn.executemany(
            "INSERT INTO person_aliases(alias, person_id) VALUES(?,?)",
            [(a, person_id) for a in all_aliases],
        )
    return person_id

merge_people(conn, keep_id, drop_id)

Move all FKs from drop_id to keep_id, delete drop_id's row.

Aliases of drop_id are reassigned. The drop row is then removed; its person_aliases cascade with it (but we already moved them).

Source code in src/transcript_indexer/people.py
def merge_people(conn: sqlite3.Connection, keep_id: int, drop_id: int) -> None:
    """Move all FKs from drop_id to keep_id, delete drop_id's row.

    Aliases of drop_id are reassigned. The drop row is then removed; its
    person_aliases cascade with it (but we already moved them).
    """
    if keep_id == drop_id:
        return
    with conn:
        conn.execute(
            "UPDATE person_aliases SET person_id = ? WHERE person_id = ?",
            (keep_id, drop_id),
        )
        conn.execute(
            "UPDATE turns SET person_id = ? WHERE person_id = ?",
            (keep_id, drop_id),
        )
        conn.execute(
            "UPDATE participants SET person_id = ? WHERE person_id = ?",
            (keep_id, drop_id),
        )
        conn.execute(
            "UPDATE entities SET person_id = ? WHERE person_id = ?",
            (keep_id, drop_id),
        )
        conn.execute(
            "UPDATE action_items SET owner_person_id = ? WHERE owner_person_id = ?",
            (keep_id, drop_id),
        )
        conn.execute("DELETE FROM people WHERE id = ?", (drop_id,))

list_people(conn)

Return [(id, canonical, [aliases])].

Source code in src/transcript_indexer/people.py
def list_people(conn: sqlite3.Connection) -> list[tuple[int, str, list[str]]]:
    """Return [(id, canonical, [aliases])]."""
    by_id: dict[int, list[str]] = {}
    for row in conn.execute(
        "SELECT person_id, alias FROM person_aliases ORDER BY person_id, alias"
    ):
        by_id.setdefault(int(row["person_id"]), []).append(str(row["alias"]))
    out: list[tuple[int, str, list[str]]] = []
    for row in conn.execute("SELECT id, canonical FROM people ORDER BY canonical"):
        pid = int(row["id"])
        out.append((pid, str(row["canonical"]), by_id.get(pid, [])))
    return out

resolve_people(conn)

Populate person_id columns by joining raw values to person_aliases.

Idempotent: rows already resolved keep their value; rows whose raw string no longer matches any alias are NOT cleared, since extraction may have intentionally produced a value that is canonical-but-not-yet-aliased.

Source code in src/transcript_indexer/people.py
def resolve_people(conn: sqlite3.Connection) -> None:
    """Populate `person_id` columns by joining raw values to `person_aliases`.

    Idempotent: rows already resolved keep their value; rows whose raw string
    no longer matches any alias are NOT cleared, since extraction may have
    intentionally produced a value that is canonical-but-not-yet-aliased.
    """
    with conn:
        # turns
        conn.execute(
            "UPDATE turns SET person_id = ("
            "  SELECT person_id FROM person_aliases WHERE alias = turns.speaker"
            ") WHERE person_id IS NULL"
        )
        # participants
        conn.execute(
            "UPDATE participants SET person_id = ("
            "  SELECT person_id FROM person_aliases WHERE alias = participants.speaker"
            ") WHERE person_id IS NULL"
        )
        # entities (type='person' only)
        conn.execute(
            "UPDATE entities SET person_id = ("
            "  SELECT person_id FROM person_aliases WHERE alias = entities.value"
            ") WHERE person_id IS NULL AND type = 'person'"
        )
        # action_items.owner_person_id
        conn.execute(
            "UPDATE action_items SET owner_person_id = ("
            "  SELECT person_id FROM person_aliases WHERE alias = action_items.owner"
            ") WHERE owner_person_id IS NULL AND owner IS NOT NULL"
        )

unknown_values(conn)

List all distinct raw values that have no person_id, with counts.

Source code in src/transcript_indexer/people.py
def unknown_values(conn: sqlite3.Connection) -> list[UnknownValue]:
    """List all distinct raw values that have no person_id, with counts."""
    out: list[UnknownValue] = []
    for row in conn.execute(
        "SELECT speaker, COUNT(*) AS n FROM turns "
        "WHERE person_id IS NULL GROUP BY speaker ORDER BY n DESC, speaker"
    ):
        out.append(UnknownValue("speaker", str(row["speaker"]), int(row["n"])))
    for row in conn.execute(
        "SELECT value, COUNT(*) AS n FROM entities "
        "WHERE type = 'person' AND person_id IS NULL "
        "GROUP BY value ORDER BY n DESC, value"
    ):
        out.append(UnknownValue("entity_person", str(row["value"]), int(row["n"])))
    for row in conn.execute(
        "SELECT owner, COUNT(*) AS n FROM action_items "
        "WHERE owner IS NOT NULL AND owner_person_id IS NULL "
        "GROUP BY owner ORDER BY n DESC, owner"
    ):
        out.append(UnknownValue("owner", str(row["owner"]), int(row["n"])))
    return out