Skip to content

Flaky Tests API

The flaky-test tracking layer records test outcomes, computes flip-rates, and manages the quarantine list.


TestTracker

Records test execution runs into the SQLite database.

Usage::

tracker = TestTracker(store)
tracker.record_run("test_login", "passed", duration_ms=120.5)
tracker.record_run("test_login", "failed", error_type="AssertionError")
runs = tracker.get_runs("test_login")
Source code in breadcrumb/flaky/tracker.py
class TestTracker:
    """Records test execution runs into the SQLite database.

    Every method obtains its connection through ``store._get_conn()`` and
    reads from or writes to the ``test_runs`` table.

    Usage::

        tracker = TestTracker(store)
        tracker.record_run("test_login", "passed", duration_ms=120.5)
        tracker.record_run("test_login", "failed", error_type="AssertionError")
        runs = tracker.get_runs("test_login")
    """

    def __init__(self, store: FingerprintStore) -> None:
        """Keep a reference to *store* and run ``migrate_schema`` on it."""
        self._store = store
        migrate_schema(store)

    def record_run(
        self,
        test_id: str,
        status: str,
        duration_ms: float | None = None,
        healing_occurred: bool = False,
        error_type: str | None = None,
        environment: str | None = None,
    ) -> None:
        """Record a test execution result.

        The row is stamped with ``time.time()`` and committed immediately.

        Args:
            test_id: Unique test identifier (e.g. pytest node ID).
            status: One of 'passed', 'failed', 'error', 'skipped'. The value
                is stored as given; it is not validated here.
            duration_ms: Test duration in milliseconds.
            healing_occurred: Whether self-healing was triggered.
            error_type: Exception class name if the test errored.
            environment: Optional environment tag (e.g. 'ci', 'local').
        """
        conn = self._store._get_conn()
        conn.execute(
            "INSERT INTO test_runs "
            "(test_id, status, duration_ms, healing_occurred, error_type, environment, timestamp) "
            "VALUES (?, ?, ?, ?, ?, ?, ?)",
            (
                test_id,
                status,
                duration_ms,
                # Stored as an integer flag; get_runs converts it back to bool.
                1 if healing_occurred else 0,
                error_type,
                environment,
                time.time(),
            ),
        )
        conn.commit()

    def get_runs(self, test_id: str, limit: int = 100) -> list[dict[str, Any]]:
        """Return recent runs for a test, newest first.

        Rows are ordered by ``timestamp`` descending. Runs that share a
        timestamp are further ordered by ``id`` descending, so the result
        order is deterministic even when several runs carry the same
        timestamp.

        Args:
            test_id: Identifier whose runs are looked up.
            limit: Maximum number of rows to return.

        Returns:
            One dict per run with the keys ``id``, ``test_id``, ``status``,
            ``duration_ms``, ``healing_occurred`` (as ``bool``),
            ``error_type``, ``environment`` and ``timestamp``.
        """
        conn = self._store._get_conn()
        # Tie-break on id: ordering by timestamp alone leaves equal-timestamp
        # rows in an unspecified order, and order-sensitive consumers (such as
        # flip counting over consecutive runs) would then be unstable.
        rows = conn.execute(
            "SELECT id, test_id, status, duration_ms, healing_occurred, "
            "error_type, environment, timestamp "
            "FROM test_runs WHERE test_id = ? "
            "ORDER BY timestamp DESC, id DESC LIMIT ?",
            (test_id, limit),
        ).fetchall()
        return [
            {
                "id": r[0],
                "test_id": r[1],
                "status": r[2],
                "duration_ms": r[3],
                "healing_occurred": bool(r[4]),
                "error_type": r[5],
                "environment": r[6],
                "timestamp": r[7],
            }
            for r in rows
        ]

    def get_all_test_ids(self) -> list[str]:
        """Return distinct test_ids that have at least one recorded run.

        The ids are sorted by ``test_id`` (``ORDER BY test_id``).
        """
        conn = self._store._get_conn()
        rows = conn.execute(
            "SELECT DISTINCT test_id FROM test_runs ORDER BY test_id",
        ).fetchall()
        return [r[0] for r in rows]
record_run(test_id: str, status: str, duration_ms: float | None = None, healing_occurred: bool = False, error_type: str | None = None, environment: str | None = None) -> None

Record a test execution result.

Parameters:

Name Type Description Default
test_id str

Unique test identifier (e.g. pytest node ID).

required
status str

One of 'passed', 'failed', 'error', 'skipped'.

required
duration_ms float | None

Test duration in milliseconds.

None
healing_occurred bool

Whether self-healing was triggered.

False
error_type str | None

Exception class name if the test errored.

None
environment str | None

Optional environment tag (e.g. 'ci', 'local').

None
Source code in breadcrumb/flaky/tracker.py
def record_run(
    self,
    test_id: str,
    status: str,
    duration_ms: float | None = None,
    healing_occurred: bool = False,
    error_type: str | None = None,
    environment: str | None = None,
) -> None:
    """Record a test execution result.

    Args:
        test_id: Unique test identifier (e.g. pytest node ID).
        status: One of 'passed', 'failed', 'error', 'skipped'.
        duration_ms: Test duration in milliseconds.
        healing_occurred: Whether self-healing was triggered.
        error_type: Exception class name if the test errored.
        environment: Optional environment tag (e.g. 'ci', 'local').
    """
    conn = self._store._get_conn()
    conn.execute(
        "INSERT INTO test_runs "
        "(test_id, status, duration_ms, healing_occurred, error_type, environment, timestamp) "
        "VALUES (?, ?, ?, ?, ?, ?, ?)",
        (
            test_id,
            status,
            duration_ms,
            1 if healing_occurred else 0,
            error_type,
            environment,
            time.time(),
        ),
    )
    conn.commit()
get_runs(test_id: str, limit: int = 100) -> list[dict[str, Any]]

Return recent runs for a test, ordered by timestamp descending.

Source code in breadcrumb/flaky/tracker.py
def get_runs(self, test_id: str, limit: int = 100) -> list[dict[str, Any]]:
    """Return recent runs for a test, newest first.

    Rows are ordered by ``timestamp`` descending. Runs that share a
    timestamp are further ordered by ``id`` descending, so the result order
    is deterministic even when several runs carry the same timestamp.

    Args:
        test_id: Identifier whose runs are looked up.
        limit: Maximum number of rows to return.

    Returns:
        One dict per run with the keys ``id``, ``test_id``, ``status``,
        ``duration_ms``, ``healing_occurred`` (as ``bool``), ``error_type``,
        ``environment`` and ``timestamp``.
    """
    conn = self._store._get_conn()
    # Tie-break on id: ordering by timestamp alone leaves equal-timestamp rows
    # in an unspecified order, and order-sensitive consumers (such as flip
    # counting over consecutive runs) would then be unstable.
    rows = conn.execute(
        "SELECT id, test_id, status, duration_ms, healing_occurred, "
        "error_type, environment, timestamp "
        "FROM test_runs WHERE test_id = ? "
        "ORDER BY timestamp DESC, id DESC LIMIT ?",
        (test_id, limit),
    ).fetchall()
    return [
        {
            "id": r[0],
            "test_id": r[1],
            "status": r[2],
            "duration_ms": r[3],
            "healing_occurred": bool(r[4]),
            "error_type": r[5],
            "environment": r[6],
            "timestamp": r[7],
        }
        for r in rows
    ]
get_all_test_ids() -> list[str]

Return distinct test_ids that have at least one recorded run.

Source code in breadcrumb/flaky/tracker.py
def get_all_test_ids(self) -> list[str]:
    """List every distinct test_id present in ``test_runs``, sorted by id."""
    cursor = self._store._get_conn().execute(
        "SELECT DISTINCT test_id FROM test_runs ORDER BY test_id",
    )
    # Each result row holds a single column: the test_id.
    return [record[0] for record in cursor.fetchall()]

migrate_schema

migrate_schema(store: FingerprintStore) -> None

Migrate a v1 DB to v2 by adding test_runs and quarantine tables.

Safe to call multiple times — CREATE IF NOT EXISTS guards are used.

Source code in breadcrumb/flaky/tracker.py
def migrate_schema(store: FingerprintStore) -> None:
    """Migrate a v1 DB to v2 by adding test_runs and quarantine tables.

    Runs the ``_V2_TABLES_SQL`` script, then records schema version 2 in
    ``schema_meta`` unless the stored version is already 2 or higher.

    Safe to call multiple times — CREATE IF NOT EXISTS guards are used.
    """
    db = store._get_conn()
    db.executescript(_V2_TABLES_SQL)

    version_row = db.execute(
        "SELECT value FROM schema_meta WHERE key = 'schema_version'",
    ).fetchone()
    # A missing schema_version row is treated as version 1.
    stored_version = 1 if not version_row else int(version_row[0])
    if stored_version >= 2:
        return

    db.execute(
        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('schema_version', '2')",
    )
    db.commit()

TestAnalyzer

Analyses test run history to detect and rank flaky tests.

Classifications

Stable — fliprate == 0.0; Intermittent — 0.0 < fliprate <= 0.2; Flaky — 0.2 < fliprate <= 0.5; Chronic — fliprate > 0.5.

Usage::

analyzer = TestAnalyzer(tracker)
fliprate = analyzer.compute_fliprate("test_login")
classification = analyzer.classify("test_login")
Source code in breadcrumb/flaky/analyzer.py
class TestAnalyzer:
    """Analyses test run history to detect and rank flaky tests.

    Classifications:
        Stable      — fliprate == 0.0
        Intermittent — 0.0 < fliprate <= 0.2
        Flaky        — 0.2 < fliprate <= 0.5
        Chronic      — fliprate > 0.5

    Usage::

        analyzer = TestAnalyzer(tracker)
        fliprate = analyzer.compute_fliprate("test_login")
        classification = analyzer.classify("test_login")
    """

    def __init__(self, tracker: TestTracker) -> None:
        self._tracker = tracker

    def compute_fliprate(self, test_id: str, window: int = 10) -> float:
        """Standard flip-rate: fraction of consecutive outcome changes.

        For n runs, there are n-1 consecutive pairs. Each pair where the
        outcome changes counts as a flip. Returns flips / (n-1).

        Returns 0.0 if fewer than 2 runs are available.
        """
        runs = self._tracker.get_runs(test_id, limit=window)
        if len(runs) < 2:
            return 0.0

        statuses = [r["status"] for r in runs]
        flips = sum(1 for a, b in itertools.pairwise(statuses) if a != b)
        return flips / (len(statuses) - 1)

    def compute_ewma_fliprate(
        self,
        test_id: str,
        alpha: float = 0.3,
        window: int = 20,
    ) -> float:
        """EWMA-weighted flip-rate (more recent flips weighted more heavily).

        Implements the exponentially weighted moving average approach from
        Apple's "Modeling and ranking flaky tests" paper. Alpha controls
        how much weight is given to recent vs older flips (higher = more
        recent-biased).

        Returns 0.0 if fewer than 2 runs are available.
        """
        runs = self._tracker.get_runs(test_id, limit=window)
        if len(runs) < 2:
            return 0.0

        statuses = [r["status"] for r in runs]
        # Pairs ordered oldest-to-newest (runs are DESC, so reverse)
        statuses = list(reversed(statuses))
        pairs = [a != b for a, b in itertools.pairwise(statuses)]

        # EWMA over the flip indicators
        ewma = float(pairs[0])
        for flip in pairs[1:]:
            ewma = alpha * float(flip) + (1 - alpha) * ewma
        return ewma

    def classify(self, test_id: str) -> str:
        """Classify a test by its flip-rate into a stability tier.

        Returns one of: 'Stable', 'Intermittent', 'Flaky', 'Chronic'.
        """
        rate = self.compute_fliprate(test_id)
        if rate == 0.0:
            return "Stable"
        if rate <= 0.2:
            return "Intermittent"
        if rate <= 0.5:
            return "Flaky"
        return "Chronic"

    def get_all_classifications(self) -> dict[str, str]:
        """Return {test_id: classification} for all known tests."""
        test_ids = self._tracker.get_all_test_ids()
        return {tid: self.classify(tid) for tid in test_ids}
compute_fliprate(test_id: str, window: int = 10) -> float

Standard flip-rate: fraction of consecutive outcome changes.

For n runs, there are n-1 consecutive pairs. Each pair where the outcome changes counts as a flip. Returns flips / (n-1).

Returns 0.0 if fewer than 2 runs are available.

Source code in breadcrumb/flaky/analyzer.py
def compute_fliprate(self, test_id: str, window: int = 10) -> float:
    """Plain flip-rate over the most recent ``window`` runs.

    Adjacent runs whose statuses differ count as one flip. With n runs
    there are n-1 adjacent pairs, and the result is flips / (n-1).

    Returns 0.0 if fewer than 2 runs are available.
    """
    history = self._tracker.get_runs(test_id, limit=window)
    if len(history) < 2:
        return 0.0

    outcomes = [run["status"] for run in history]
    pair_count = len(outcomes) - 1
    # Pair each outcome with its neighbour and count the mismatches.
    flip_count = sum(
        left != right for left, right in zip(outcomes, outcomes[1:])
    )
    return flip_count / pair_count
compute_ewma_fliprate(test_id: str, alpha: float = 0.3, window: int = 20) -> float

EWMA-weighted flip-rate (more recent flips weighted more heavily).

Implements the exponentially weighted moving average approach from Apple's "Modeling and ranking flaky tests" paper. Alpha controls how much weight is given to recent vs older flips (higher = more recent-biased).

Returns 0.0 if fewer than 2 runs are available.

Source code in breadcrumb/flaky/analyzer.py
def compute_ewma_fliprate(
    self,
    test_id: str,
    alpha: float = 0.3,
    window: int = 20,
) -> float:
    """EWMA-weighted flip-rate (more recent flips weighted more heavily).

    Implements the exponentially weighted moving average approach from
    Apple's "Modeling and ranking flaky tests" paper. Alpha controls
    how much weight is given to recent vs older flips (higher = more
    recent-biased).

    Returns 0.0 if fewer than 2 runs are available.
    """
    runs = self._tracker.get_runs(test_id, limit=window)
    if len(runs) < 2:
        return 0.0

    statuses = [r["status"] for r in runs]
    # Pairs ordered oldest-to-newest (runs are DESC, so reverse)
    statuses = list(reversed(statuses))
    pairs = [a != b for a, b in itertools.pairwise(statuses)]

    # EWMA over the flip indicators
    ewma = float(pairs[0])
    for flip in pairs[1:]:
        ewma = alpha * float(flip) + (1 - alpha) * ewma
    return ewma
classify(test_id: str) -> str

Classify a test by its flip-rate into a stability tier.

Returns one of: 'Stable', 'Intermittent', 'Flaky', 'Chronic'.

Source code in breadcrumb/flaky/analyzer.py
def classify(self, test_id: str) -> str:
    """Map a test's flip-rate onto a stability tier.

    Returns one of: 'Stable', 'Intermittent', 'Flaky', 'Chronic'.
    """
    rate = self.compute_fliprate(test_id)
    if rate == 0.0:
        return "Stable"
    # Inclusive upper bounds, checked in ascending order.
    for upper_bound, label in ((0.2, "Intermittent"), (0.5, "Flaky")):
        if rate <= upper_bound:
            return label
    return "Chronic"
get_all_classifications() -> dict[str, str]

Return {test_id: classification} for all known tests.

Source code in breadcrumb/flaky/analyzer.py
def get_all_classifications(self) -> dict[str, str]:
    """Return {test_id: classification} for every id the tracker knows."""
    result: dict[str, str] = {}
    for tid in self._tracker.get_all_test_ids():
        result[tid] = self.classify(tid)
    return result

QuarantineManager

Manages the quarantine list for flaky tests.

Quarantined tests
  • Are still executed so data keeps accumulating.
  • Their failures should not block CI (callers are responsible for enforcing this; the manager only tracks quarantine state).
  • Are automatically released when their classification improves to Stable or Intermittent.

Usage::

manager = QuarantineManager(store, analyzer)
manager.quarantine("test_checkout", "auto: Chronic fliprate 0.7")
if manager.is_quarantined("test_checkout"):
    ...
report = manager.auto_update()
Source code in breadcrumb/flaky/quarantine.py
class QuarantineManager:
    """Manages the quarantine list for flaky tests.

    Quarantined tests:
        - Are still executed so data keeps accumulating.
        - Their failures should not block CI (callers are responsible for
          enforcing this; the manager only tracks quarantine state).
        - Are automatically released when their classification improves to
          Stable or Intermittent.

    Usage::

        manager = QuarantineManager(store, analyzer)
        manager.quarantine("test_checkout", "auto: Chronic fliprate 0.7")
        if manager.is_quarantined("test_checkout"):
            ...
        report = manager.auto_update()
    """

    def __init__(
        self,
        store: FingerprintStore,
        analyzer: TestAnalyzer,
        fliprate_threshold: float = 0.3,
    ) -> None:
        """Keep references to the store, the analyzer and the threshold."""
        self._threshold = fliprate_threshold
        self._analyzer = analyzer
        self._store = store

    def is_quarantined(self, test_id: str) -> bool:
        """Tell whether ``test_id`` has a row in the quarantine table."""
        cursor = self._store._get_conn().execute(
            "SELECT 1 FROM quarantine WHERE test_id = ?",
            (test_id,),
        )
        return cursor.fetchone() is not None

    def quarantine(self, test_id: str, reason: str) -> None:
        """Insert or replace the quarantine row for ``test_id`` and commit.

        The row is stamped with ``time.time()`` and always written with
        ``auto_unquarantine`` set to 1.
        """
        values = (test_id, reason, time.time())
        db = self._store._get_conn()
        db.execute(
            "INSERT OR REPLACE INTO quarantine (test_id, reason, quarantined_at, auto_unquarantine) "
            "VALUES (?, ?, ?, 1)",
            values,
        )
        db.commit()

    def unquarantine(self, test_id: str) -> None:
        """Delete the quarantine row for ``test_id`` (if any) and commit."""
        db = self._store._get_conn()
        db.execute("DELETE FROM quarantine WHERE test_id = ?", (test_id,))
        db.commit()

    def auto_update(self) -> dict[str, list[str]]:
        """Auto-quarantine Flaky/Chronic tests; release Stable/Intermittent ones.

        Only tests with auto_unquarantine=1 are candidates for automatic release.

        Returns:
            {"quarantined": [list of newly quarantined test_ids],
             "unquarantined": [list of released test_ids]}
        """
        added: list[str] = []
        released: list[str] = []

        for test_id, classification in self._analyzer.get_all_classifications().items():
            in_quarantine = self.is_quarantined(test_id)

            if classification in ("Flaky", "Chronic"):
                if in_quarantine:
                    continue
                reason = f"auto: {classification} (fliprate threshold {self._threshold:.2f})"
                self.quarantine(test_id, reason)
                added.append(test_id)
                continue

            if not in_quarantine or classification not in ("Stable", "Intermittent"):
                continue

            # Release only rows whose auto_unquarantine flag is set.
            flag_row = self._store._get_conn().execute(
                "SELECT auto_unquarantine FROM quarantine WHERE test_id = ?",
                (test_id,),
            ).fetchone()
            if flag_row and flag_row[0]:
                self.unquarantine(test_id)
                released.append(test_id)

        return {"quarantined": added, "unquarantined": released}

    def get_all_quarantined(self) -> list[str]:
        """List quarantined test_ids, most recently quarantined first."""
        cursor = self._store._get_conn().execute(
            "SELECT test_id FROM quarantine ORDER BY quarantined_at DESC",
        )
        return [record[0] for record in cursor.fetchall()]
is_quarantined(test_id: str) -> bool

Return True if the test is currently quarantined.

Source code in breadcrumb/flaky/quarantine.py
def is_quarantined(self, test_id: str) -> bool:
    """Tell whether ``test_id`` has a row in the quarantine table."""
    cursor = self._store._get_conn().execute(
        "SELECT 1 FROM quarantine WHERE test_id = ?",
        (test_id,),
    )
    return cursor.fetchone() is not None
quarantine(test_id: str, reason: str) -> None

Add a test to the quarantine list.

Source code in breadcrumb/flaky/quarantine.py
def quarantine(self, test_id: str, reason: str) -> None:
    """Insert or replace the quarantine row for ``test_id`` and commit.

    The row is stamped with ``time.time()`` and always written with
    ``auto_unquarantine`` set to 1.
    """
    values = (test_id, reason, time.time())
    db = self._store._get_conn()
    db.execute(
        "INSERT OR REPLACE INTO quarantine (test_id, reason, quarantined_at, auto_unquarantine) "
        "VALUES (?, ?, ?, 1)",
        values,
    )
    db.commit()
unquarantine(test_id: str) -> None

Remove a test from the quarantine list.

Source code in breadcrumb/flaky/quarantine.py
def unquarantine(self, test_id: str) -> None:
    """Delete the quarantine row for ``test_id`` (if any) and commit."""
    db = self._store._get_conn()
    delete_sql = "DELETE FROM quarantine WHERE test_id = ?"
    db.execute(delete_sql, (test_id,))
    db.commit()
auto_update() -> dict[str, list[str]]

Auto-quarantine Flaky/Chronic tests; release Stable/Intermittent ones.

Only tests with auto_unquarantine=1 are candidates for automatic release.

Returns:

Type Description
dict[str, list[str]]

{"quarantined": [list of newly quarantined test_ids], "unquarantined": [list of released test_ids]}

Source code in breadcrumb/flaky/quarantine.py
def auto_update(self) -> dict[str, list[str]]:
    """Auto-quarantine Flaky/Chronic tests; release Stable/Intermittent ones.

    Only tests with auto_unquarantine=1 are candidates for automatic release.

    Returns:
        {"quarantined": [list of newly quarantined test_ids],
         "unquarantined": [list of released test_ids]}
    """
    added: list[str] = []
    released: list[str] = []

    for test_id, classification in self._analyzer.get_all_classifications().items():
        in_quarantine = self.is_quarantined(test_id)

        if classification in ("Flaky", "Chronic"):
            if in_quarantine:
                continue
            reason = f"auto: {classification} (fliprate threshold {self._threshold:.2f})"
            self.quarantine(test_id, reason)
            added.append(test_id)
            continue

        if not in_quarantine or classification not in ("Stable", "Intermittent"):
            continue

        # Release only rows whose auto_unquarantine flag is set.
        flag_row = self._store._get_conn().execute(
            "SELECT auto_unquarantine FROM quarantine WHERE test_id = ?",
            (test_id,),
        ).fetchone()
        if flag_row and flag_row[0]:
            self.unquarantine(test_id)
            released.append(test_id)

    return {"quarantined": added, "unquarantined": released}
get_all_quarantined() -> list[str]

Return the list of all currently quarantined test_ids.

Source code in breadcrumb/flaky/quarantine.py
def get_all_quarantined(self) -> list[str]:
    """List quarantined test_ids, most recently quarantined first."""
    cursor = self._store._get_conn().execute(
        "SELECT test_id FROM quarantine ORDER BY quarantined_at DESC",
    )
    return [record[0] for record in cursor.fetchall()]