Speaker disambiguation: people, person_aliases, and the resolver.
turns.speaker, participants.speaker, entities.value (when type='person'),
and action_items.owner always store the raw source string. A nullable
person_id FK on each row is populated by the deterministic post-pass below
via case-insensitive lookup in person_aliases. NULL means "unresolved" —
the row is still queryable by raw string.
Alias curation is semi-manual via the txi people subcommand surface in
cli.py. Slack readers (Phase 8) auto-populate aliases from S_USER.
add_person(conn, canonical, aliases)
Insert a person and (canonical + given aliases) into person_aliases.
The canonical name is itself added as an alias so a raw canonical string
resolves without an explicit alias entry.
Source code in src/transcript_indexer/people.py
| def add_person(conn: sqlite3.Connection, canonical: str, aliases: list[str]) -> int:
"""Insert a person and (canonical + given aliases) into person_aliases.
The canonical name is itself added as an alias so a raw canonical string
resolves without an explicit alias entry.
"""
with conn:
cur = conn.execute("INSERT INTO people(canonical) VALUES(?)", (canonical,))
person_id = int(cur.lastrowid or 0)
all_aliases = {canonical, *aliases}
conn.executemany(
"INSERT INTO person_aliases(alias, person_id) VALUES(?,?)",
[(a, person_id) for a in all_aliases],
)
return person_id
|
merge_people(conn, keep_id, drop_id)
Move all FKs from drop_id to keep_id, delete drop_id's row.
Aliases of drop_id are reassigned. The drop row is then removed; its
person_aliases cascade with it (but we already moved them).
Source code in src/transcript_indexer/people.py
| def merge_people(conn: sqlite3.Connection, keep_id: int, drop_id: int) -> None:
"""Move all FKs from drop_id to keep_id, delete drop_id's row.
Aliases of drop_id are reassigned. The drop row is then removed; its
person_aliases cascade with it (but we already moved them).
"""
if keep_id == drop_id:
return
with conn:
conn.execute(
"UPDATE person_aliases SET person_id = ? WHERE person_id = ?",
(keep_id, drop_id),
)
conn.execute(
"UPDATE turns SET person_id = ? WHERE person_id = ?",
(keep_id, drop_id),
)
conn.execute(
"UPDATE participants SET person_id = ? WHERE person_id = ?",
(keep_id, drop_id),
)
conn.execute(
"UPDATE entities SET person_id = ? WHERE person_id = ?",
(keep_id, drop_id),
)
conn.execute(
"UPDATE action_items SET owner_person_id = ? WHERE owner_person_id = ?",
(keep_id, drop_id),
)
conn.execute("DELETE FROM people WHERE id = ?", (drop_id,))
|
list_people(conn)
Return [(id, canonical, [aliases])].
Source code in src/transcript_indexer/people.py
| def list_people(conn: sqlite3.Connection) -> list[tuple[int, str, list[str]]]:
"""Return [(id, canonical, [aliases])]."""
by_id: dict[int, list[str]] = {}
for row in conn.execute(
"SELECT person_id, alias FROM person_aliases ORDER BY person_id, alias"
):
by_id.setdefault(int(row["person_id"]), []).append(str(row["alias"]))
out: list[tuple[int, str, list[str]]] = []
for row in conn.execute("SELECT id, canonical FROM people ORDER BY canonical"):
pid = int(row["id"])
out.append((pid, str(row["canonical"]), by_id.get(pid, [])))
return out
|
resolve_people(conn)
Populate person_id columns by joining raw values to person_aliases.
Idempotent: rows already resolved keep their value; rows whose raw string
no longer matches any alias are NOT cleared, since extraction may have
intentionally produced a value that is canonical-but-not-yet-aliased.
Source code in src/transcript_indexer/people.py
| def resolve_people(conn: sqlite3.Connection) -> None:
"""Populate `person_id` columns by joining raw values to `person_aliases`.
Idempotent: rows already resolved keep their value; rows whose raw string
no longer matches any alias are NOT cleared, since extraction may have
intentionally produced a value that is canonical-but-not-yet-aliased.
"""
with conn:
# turns
conn.execute(
"UPDATE turns SET person_id = ("
" SELECT person_id FROM person_aliases WHERE alias = turns.speaker"
") WHERE person_id IS NULL"
)
# participants
conn.execute(
"UPDATE participants SET person_id = ("
" SELECT person_id FROM person_aliases WHERE alias = participants.speaker"
") WHERE person_id IS NULL"
)
# entities (type='person' only)
conn.execute(
"UPDATE entities SET person_id = ("
" SELECT person_id FROM person_aliases WHERE alias = entities.value"
") WHERE person_id IS NULL AND type = 'person'"
)
# action_items.owner_person_id
conn.execute(
"UPDATE action_items SET owner_person_id = ("
" SELECT person_id FROM person_aliases WHERE alias = action_items.owner"
") WHERE owner_person_id IS NULL AND owner IS NOT NULL"
)
|
unknown_values(conn)
List all distinct raw values that have no person_id, with counts.
Source code in src/transcript_indexer/people.py
| def unknown_values(conn: sqlite3.Connection) -> list[UnknownValue]:
"""List all distinct raw values that have no person_id, with counts."""
out: list[UnknownValue] = []
for row in conn.execute(
"SELECT speaker, COUNT(*) AS n FROM turns "
"WHERE person_id IS NULL GROUP BY speaker ORDER BY n DESC, speaker"
):
out.append(UnknownValue("speaker", str(row["speaker"]), int(row["n"])))
for row in conn.execute(
"SELECT value, COUNT(*) AS n FROM entities "
"WHERE type = 'person' AND person_id IS NULL "
"GROUP BY value ORDER BY n DESC, value"
):
out.append(UnknownValue("entity_person", str(row["value"]), int(row["n"])))
for row in conn.execute(
"SELECT owner, COUNT(*) AS n FROM action_items "
"WHERE owner IS NOT NULL AND owner_person_id IS NULL "
"GROUP BY owner ORDER BY n DESC, owner"
):
out.append(UnknownValue("owner", str(row["owner"]), int(row["n"])))
return out
|