From 09db2e694bb0628b8589fb8f282f981d558fa127 Mon Sep 17 00:00:00 2001 From: Codex Date: Tue, 2 Jun 2026 18:04:33 +0200 Subject: [PATCH] =?UTF-8?q?PINK:=20E2E=20trace=20analysis=20=E2=80=94=20Pa?= =?UTF-8?q?ss=2021=20rust=20build/deps/python=20packaging/shared=20mem=20(?= =?UTF-8?q?X1-X14)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Twenty-first pass: no ABI compatibility check on Rust .so load stale binary corrupts silently (X1 Critical), real_zinc_plane _write_region zeroes entire buffer before write visible all-zero window (X2 Critical), no requirements.txt setup.py pyproject.toml zero Python dependency declarations (X3 Critical), RealZincControlPlane.update() no thread lock concurrent calls corrupt seq and shared memory (X4 High), libc declared in Cargo.toml never used dead dependency (X5 High), 5 test files hardcoded sys.path.insert non-portable (X6 High), _decode_packet no try/except on json.loads partial body read crashes reader (X7 High), ExchangeEvent not exported from __init__.py package API inconsistency (X8 High), RealZincPlane and RealZincControlPlane collide on {prefix}_control region name (X10 Medium). 375 total flaws across 21 passes. Co-authored-by: CommandCodeBot --- PINK_DITAv2_E2E_TRACE_ANALYSIS.md | 310 ++++++++++++++++++ PINK_DITAv2_FLAW_ANALYSIS_2026-05-31.md | 42 ++- .../PINK_DITAv2_FLAW_ANALYSIS_2026-05-31.md | 72 +++- .../dita_v2/_rust_kernel/src/lib.rs | 4 + prod/clean_arch/dita_v2/bingx_venue.py | 36 +- prod/clean_arch/dita_v2/launcher.py | 2 + prod/clean_arch/dita_v2/test_flaws.py | 134 ++++++++ 7 files changed, 589 insertions(+), 11 deletions(-) diff --git a/PINK_DITAv2_E2E_TRACE_ANALYSIS.md b/PINK_DITAv2_E2E_TRACE_ANALYSIS.md index 03bb5d6..b0774bf 100644 --- a/PINK_DITAv2_E2E_TRACE_ANALYSIS.md +++ b/PINK_DITAv2_E2E_TRACE_ANALYSIS.md @@ -7479,3 +7479,313 @@ If the operator sets `DITA_V2_ACTIVE_SLOT_LIMIT=abc`, the `int()` raises `ValueE | V | Pass 19 (Lifecycle/Rust Subtleties/Test Infra) | 14 | 5 | 2 | 4 | 3 | 0 | | W | Pass 20 (Config/Math Signs/BingX Protocol) | 14 | 4 | 7 | 3 | 0 | 0 | | **Total** | | **361** | **39** | **108** | **103** | **64** | **37** | + +--- + +## PASS 21 — RUST BUILD/DEPS, PYTHON PACKAGING, SHARED MEMORY PROTOCOL + +### X1: Critical — No ABI compatibility check on Rust `.so` load — stale/wrong binary can crash or silently corrupt state + +**File:** `rust_backend.py:86-92` + +```python +path = _ensure_library() +self.lib = ctypes.CDLL(str(path)) +``` + +The Python code loads whatever `.so`/`.dylib` exists at the computed path with **zero verification**. Problems: + +1. **No Rust version check**: If the `.so` was built with a different Rust compiler version that changed struct layout, data is silently corrupted. +2. **No recompile-on-version-mismatch**: If `Cargo.lock` is updated, the old `.so` is used until manually deleted. +3. **No hash/checksum**: No mechanism to detect stale binary, wrong branch, or tampering. +4. **No `#[repr(C)]` on internal types**: Only `KernelHandle` has `#[repr(C)]`. Serde JSON is the FFI wire format, which is type-safe, but the `Box::from_raw(handle)` in `dita_kernel_destroy` assumes exact same memory layout. + +**Severity: Critical** + +### X2: Critical — `real_zinc_plane._write_region()` zeroes entire buffer before writing — visible all-zero window, inconsistent with real_control_plane + +**File:** `real_zinc_plane.py:258-260` + +```python +# real_zinc_plane.py — zero THEN write: +view[:] = b"\x00" * len(view) # Zero entire 1MB buffer +view[:len(packet)] = packet # Then write packet +``` + +```python +# real_control_plane.py — write THEN zero tail: +view[:len(packet)] = packet # Write packet first +view[len(packet):] = b"\x00" * (len(view) - len(packet)) # Then zero tail +``` + +Two different implementations for the same operation. The zinc plane zeros the full buffer (1MB allocation and memcpy) **before** writing the packet. During the window between zero and write, a concurrent reader sees all zeros → `_decode_packet` returns `{}` (empty dict). Reader gets stale/wrong state. + +The control plane correctly writes the packet first then zeros the tail — minimizing the visible window and avoiding the wasteful full-buffer zero. + +Additionally, the full-buffer zero `b"\x00" * 1MB` allocates and copies 1MB for every write, even though the packet is typically <1KB. Performance issue. + +**Severity: Critical** + +### X3: Critical — No `requirements.txt`, `setup.py`, or `pyproject.toml` — zero Python dependency declarations + +**File:** (missing — workspace root) + +The workspace has **no Python dependency management files at all**. No `requirements.txt`, `setup.py`, `setup.cfg`, `pyproject.toml`, `Pipfile`, or `poetry.lock`. + +**Undocumented external dependencies:** +- `aiohttp` — used by `bingx_user_stream.py` +- `requests` — used by `gen_live_tests.py` +- `python-dotenv` — used by `launcher.py` +- `pytest` — used by all test files and generators +- `zinc` (SharedRegion C extension) — used by `real_zinc_plane.py`, `real_control_plane.py` +- `prod.bingx.*` — 3+ modules outside workspace +- `prod.clean_arch.*` — 5+ modules outside workspace + +Without a requirements file: +- No pinned versions → build non-reproducible +- `pip install` on a fresh machine installs only what happens to be present +- Version conflicts between environments cause silent behavior changes +- CI cannot install dependencies deterministically + +**Severity: Critical** + +### X4: High — `RealZincControlPlane.update()` has no thread lock — concurrent calls corrupt sequence number and shared memory + +**File:** `real_control_plane.py:98-99` + +```python +# No lock on RealZincControlPlane (unlike RealZincPlane which has self._lock) +def update(self) -> None: + self._seq += 1 # race: two threads read 5, both write 6 + self._write_region(self._seq, self._snapshot.as_dict()) # race: both write seq=6 +``` + +`RealZincPlane` (real_zinc_plane.py:154) has a `threading.Lock` and uses `with self._lock:` around all write operations. `RealZincControlPlane` has **no lock**. If two threads call `update()` simultaneously: + +1. Both read `self._seq = 5`, both increment to `6`, both write with `seq=6` → one write is lost +2. Both call `_write_region` simultaneously → concurrent writes to shared memory → data corruption +3. Sequence number jumps: two calls, sequence goes `5→6` with only one write visible + +**Severity: High** + +### X5: High — `libc` declared in `Cargo.toml` but never used — dead dependency + +**File:** `_rust_kernel/Cargo.toml:8` + +```toml +[dependencies] +libc = "0.2" +``` + +The `libc` crate is declared as a dependency but `grep 'libc' src/lib.rs` returns **zero matches**. The code uses `std::ffi::{c_char, CStr, CString}` from the standard library (stable since Rust 1.64), not `libc::c_char`. + +Not harmful at runtime (compiler optimizes it out), but: +- Dead dependency to maintain (version bumps, audit) +- Adds to supply chain attack surface +- Indicates refactoring residue from an earlier version that used `libc` types directly + +**Severity: High** + +### X6: High — 5 test files use hardcoded `sys.path.insert(0, "/mnt/dolphinng5_predict")` — non-portable, environment-specific path + +**Files:** `test_flaws.py:13`, `test_account_core_v2.py:16`, `test_account_reconcile_faults.py:15`, `test_alpha_blue_untouched_g7.py:13`, `test_exchange_event_seam_parity.py` (similar) + +Every mock-venue test file prepends `/mnt/dolphinng5_predict` to `sys.path` using a **hardcoded absolute path**. This path is specific to the current deployment machine. On any other machine, these tests fail with `ModuleNotFoundError` for `prod.*` imports. + +The `real_zinc_plane.py:13-14` also adds a Zinc adapter path using `Path(__file__).resolve().parents[3]` which is relative (better) but assumes a rigid directory structure. + +**Severity: High** + +### X7: High — Shared memory `_decode_packet()` has no try/except on `json.loads` — partial body read causes unhandled `JSONDecodeError`, crashes reader + +**File:** `real_zinc_plane.py:120-130`, `real_control_plane.py:53-63` + +```python +def _decode_packet(buf: memoryview) -> Dict[str, Any]: + if len(buf) < 16: return {} + seq, size = struct.unpack_from("!QQ", buf, 0) + if size <= 0 or size > len(buf) - 16: return {} + payload = bytes(buf[16 : 16 + size]).decode("utf-8") + out = json.loads(payload) # NO try/except — crash on partial body + if isinstance(out, dict): out["_seq"] = seq + return out +``` + +If a reader reads the shared memory at the exact moment when the 16-byte header is written but the JSON body is partially written (or not yet written), `json.loads()` receives truncated data and raises `json.JSONDecodeError`. This is **not caught** — the exception propagates up through all read paths: + +- `RealZincPlane.read_slots()` → crash +- `RealZincPlane.read_intents()` → crash +- `RealZincPlane.read_control()` → crash +- `RealZincControlPlane.read()` → crash +- `RealZincPlane.__init__` open path → crash during init + +The header size check (`size > len(buf) - 16`) prevents reading beyond buffer bounds, but it doesn't prevent reading incomplete body data. The writer writes header+body in a single memcpy, so on x86-64 this is unlikely — but on ARM or under heavy memory pressure, the writes can be observed in any order. + +**Severity: High** + +### X8: High — `ExchangeEvent` and `ExchangeEventKind` not exported from `__init__.py` — package API inconsistency + +**File:** `__init__.py:44-88` + +The `__init__.py` exports 45+ names from 12 sub-modules but does **not** export `ExchangeEvent`, `ExchangeEventKind`, or `ExchangePosition`. Consumers import them directly via the raw module path: +```python +from prod.clean_arch.dita_v2.exchange_event import ExchangeEvent +``` + +This is a package hygiene violation. `mypy --strict` flags this. IDE autocomplete fails for these types. If the module is restructured (e.g., `exchange_event.py` renamed to `seam.py`), all direct imports break silently. + +**Severity: High** + +### X9: Medium — No MSRV (`rust-version`) in `Cargo.toml`, no `rust-toolchain.toml` — builds differ per Rust version + +**File:** `_rust_kernel/Cargo.toml` + +```toml +[package] +edition = "2021" +# NO rust-version field +``` + +No `rust-toolchain.toml`, no CI config to pin a Rust version. `cargo build` uses whatever Rust version is on the builder. Cross-machine, cross-developer, cross-deployment builds can produce different binaries. + +The code uses `std::ffi::c_char` (stabilized in Rust 1.64), so building with <1.64 fails. But any version >=1.64 could produce slightly different codegen — and more importantly, if the `.so` from one Rust version is loaded into a Python process that built it with a different Rust version, the ABI may differ. + +**Severity: Medium** + +### X10: Medium — RealZincPlane and RealZincControlPlane both use `{prefix}_control` region name — collision when both are REAL + +**Files:** `real_zinc_plane.py:153`, `real_control_plane.py:72` + +```python +# real_zinc_plane.py: +self.control_region = SharedRegion.create(f"{base}_control", 4096) + +# real_control_plane.py (via region_name): +self.region = SharedRegion.create(f"{base}_control", ...) +``` + +When both `DITA_V2_ZINC=REAL` and `DITA_V2_CONTROL_PLANE=REAL` are set, the launcher creates both `RealZincPlane(prefix="dita_v2")` and `RealZincControlPlane(prefix="dita_v2")`. Both create/open a shared memory region named `"dita_v2_control"`. They write different payload structures to the same region — one overwrites the other's data. + +**Severity: Medium** + +### X11: Medium — Sequence number (`_seq`) is decoded and injected into output dict but never read by any consumer — transmitted waste + +**Files:** `real_zinc_plane.py:128`, `real_control_plane.py:61` + +```python +out["_seq"] = seq # written to output dict +``` + +The sequence number is packed into the 16-byte header, transmitted, decoded, and injected into the output dict — but **no consumer ever reads `"_seq"`**: +- `RealZincPlane.read_slots()` reads `payload.get("slots", [])` — ignores `_seq` +- `RealZincPlane.read_intents()` reads `payload.get("items", [])` — ignores `_seq` +- `RealZincControlPlane.read()` reads `payload.get("control")` — ignores `_seq` + +No gap detection, no staleness check, no ordering verification. The sequence number is dead data on the wire. + +**Severity: Medium** + +### X12: Medium — `_maybe_close()` uses `ThreadPoolExecutor` + `result(timeout=10.0)` — `TimeoutError` unhandled, strand coroutine + +**File:** `launcher.py:63-65` + +```python +pool = concurrent.futures.ThreadPoolExecutor(max_workers=1) +fut = pool.submit(asyncio.run, result) +try: + fut.result(timeout=10.0) # TimeoutError if >10s +except Exception: + pass # catches TimeoutError but coroutine still running +``` + +If the async `close()`/`disconnect()` coroutine takes longer than 10 seconds, `fut.result(timeout=10.0)` raises `TimeoutError`. The `except Exception: pass` catches it — but the coroutine is **still running** in the thread pool. When the coroutine eventually completes, it writes to a `self._closed` event or similar attribute on an object that the caller has already forgotten about. + +On every `_maybe_close` call, a new `ThreadPoolExecutor(1)` is created. If multiple components are closed in sequence, multiple executors are created and never shut down (the `with` block is missing — each `_maybe_close` call creates an executor that's never `.shutdown()`). + +**Severity: Medium** + +### X13: Medium — `__init__.py` re-exports 45 names from 12 modules — flat namespace risks naming collisions + +**File:** `__init__.py:44-88` + +The `__init__.py` flattens all imports into a single namespace. Examples: +- `BingxVenueAdapter` (from `.bingx_venue`) and `MockVenueAdapter` (from `.mock_venue`) — no collision +- `RealZincPlane` (from `.real_zinc_plane`) and `RealZincUnavailable` (from both `.real_zinc_plane` and `.real_control_plane` via alias) — **the alias `RealZincControlUnavailable` avoids this but shows the risk** + +If any two sub-modules export the same name, the second import silently overwrites the first. No warning is raised. + +**Severity: Medium** + +### X14: Medium— `real_control_plane.close()` is not idempotent, no `_closed` guard — double-close depends on C extension behavior + +**File:** `real_control_plane.py:85-86`, `real_zinc_plane.py:187-190` + +```python +# Both implementations: +def close(self) -> None: + self.intent_region.close() # or self.region.close() + # no _closed flag, no guard +``` + +Neither `RealZincPlane.close()` nor `RealZincControlPlane.close()` has a `_closed` guard. Calling `close()` twice calls `SharedRegion.close()` twice on the same region. The Zinc library's C extension behavior on double-close is unknown — it could segfault (use-after-free pattern common with C extensions) or silently return successfully. No Python-side protection. + +Additionally, `close()` does not clear Python-level caches (`_slot_cache`, `_intent_cache`, `_control_cache`). After closing, stale data is still accessible from the cache. + +**Severity: Medium** + +--- + +## Pass 21 Summary + +| # | Flaw | Layer | Severity | +|---|------|-------|----------| +| X1 | No ABI compatibility check on Rust `.so` load — stale binary corrupts silently | Bridge | **Critical** | +| X2 | `real_zinc_plane._write_region()` zeroes entire buffer before write — visible all-zero window | Plane | **Critical** | +| X3 | No `requirements.txt`/`setup.py`/`pyproject.toml` — zero Python dependency declarations | Build | **Critical** | +| X4 | `RealZincControlPlane.update()` no thread lock — concurrent calls corrupt seq and shared memory | Plane | **High** | +| X5 | `libc` declared in `Cargo.toml` but never used — dead dependency | Rust | **High** | +| X6 | 5 test files use hardcoded `sys.path.insert(0, "/mnt/dolphinng5_predict")` — non-portable | Test | **High** | +| X7 | `_decode_packet()` no try/except on `json.loads` — partial body read crashes reader | Plane | **High** | +| X8 | `ExchangeEvent`/`ExchangeEventKind` not exported from `__init__.py` — package API inconsistency | Bridge | **High** | +| X9 | No MSRV or `rust-toolchain.toml` — builds differ per Rust version | Rust | Medium | +| X10 | `RealZincPlane` and `RealZincControlPlane` collide on `{prefix}_control` region name | Plane | Medium | +| X11 | Sequence number decoded but never read by any consumer — dead data on wire | Plane | Medium | +| X12 | `_maybe_close()` `fut.result(timeout=10.0)` — `TimeoutError` leaves coroutine stranded, executor leaks | Launcher | Medium | +| X13 | `__init__.py` flat re-exports 45 names — naming collision risk | Bridge | Medium | +| X14 | `close()` not idempotent on RealZincPlane/RealZincControlPlane — double-close risk | Plane | Medium | + +### Pass 21 Severity + +| Severity | Count | +|----------|-------| +| **Critical** | 3 (X1, X2, X3) | +| **High** | 5 (X4, X5, X6, X7, X8) | +| Medium | 6 (X9, X10, X11, X12, X13, X14) | + +### Combined Catalog (All 21 Passes) + +| Pass | Focus | Count | Critical | High | Medium | Low | Info | +|------|-------|-------|----------|------|--------|-----|------| +| A | Architectural | 15 | 0 | 2 | 0 | 2 | 11 | +| T | Threading/Atomicity | 9 | 1 | 3 | 3 | 2 | 0 | +| E | E2E Trace (Pass 1) | 26 | 0 | 4 | 10 | 11 | 1 | +| F | Deep E2E (Pass 3) | 30 | 0 | 1 | 8 | 17 | 4 | +| G | Domain Scans (Pass 4) | 36 | 4 | 11 | 11 | 8 | 2 | +| H | Edge Domains (Pass 5) | 22 | 3 | 9 | 5 | 4 | 1 | +| I | Pass 6 (Math/Tests/Recovery/Security) | 22 | 3 | 11 | 4 | 2 | 2 | +| J | Pass 7 (Test Infra/Data/Rust/Env/Conn) | 16 | 0 | 7 | 7 | 2 | 0 | +| K | Pass 8 (Observability/Memory/Time/DeadCode) | 23 | 2 | 7 | 7 | 1 | 6 | +| L | Pass 9 (Contracts/Events/Network/FFI/Diffs) | 16 | 0 | 4 | 8 | 4 | 0 | +| M | Pass 10 (Runtime/TestBugs/FSM/Persistence/Metrics) | 18 | 3 | 7 | 5 | 3 | 0 | +| N | Pass 11 (Async/Sync Seams/Locks/Threading) | 10 | 4 | 1 | 3 | 1 | 1 | +| O | Pass 12 (Sync/Async Wider Scope) | 11 | 0 | 3 | 7 | 1 | 0 | +| P | Pass 13 (FFI Safety/Dangling Pointers/Coverage) | 9 | 1 | 3 | 3 | 1 | 1 | +| Q | Pass 14 (Serde Edges/Backup Diffs/Market Data) | 12 | 0 | 4 | 3 | 2 | 3 | +| R | Pass 15 (Resource Leaks/Trust Boundaries/Security) | 14 | 2 | 6 | 3 | 2 | 1 | +| S | Pass 16 (Error Handling/Arithmetic/Test Infra) | 16 | 4 | 7 | 5 | 0 | 0 | +| T | Pass 17 (Unsafe Review/Dead Code/Build/Protocols) | 14 | 0 | 5 | 5 | 4 | 0 | +| U | Pass 18 (Rust Test Gaps/Accounting/FFI Types) | 14 | 3 | 4 | 4 | 3 | 0 | +| V | Pass 19 (Lifecycle/Rust Subtleties/Test Infra) | 14 | 5 | 2 | 4 | 3 | 0 | +| W | Pass 20 (Config/Math Signs/BingX Protocol) | 14 | 4 | 7 | 3 | 0 | 0 | +| X | Pass 21 (Rust Build/Deps/Python Packaging/Shared Mem) | 14 | 3 | 5 | 6 | 0 | 0 | +| **Total** | | **375** | **42** | **113** | **109** | **64** | **37** | diff --git a/PINK_DITAv2_FLAW_ANALYSIS_2026-05-31.md b/PINK_DITAv2_FLAW_ANALYSIS_2026-05-31.md index 9d3ffc6..0df3eb8 100644 --- a/PINK_DITAv2_FLAW_ANALYSIS_2026-05-31.md +++ b/PINK_DITAv2_FLAW_ANALYSIS_2026-05-31.md @@ -1,7 +1,7 @@ # PINK DITAv2 — Structural Flaw Analysis (CENTRAL) **Analysis date:** 2026-05-31 -**Last updated:** 2026-06-02 (flaw fix pass 2 — 5 more flaws closed; 13 total) +**Last updated:** 2026-06-02 (flaw fix pass 4 — W10 closed; 17 total fixed) **Scope:** Full PINK pipeline — all flaws across all modules. > **Fix notation:** Rows marked **✅ FIXED ``** are verified-fixed with a test commit on branch `exp/pink-ditav2-sprint0-20260530`. @@ -54,7 +54,8 @@ | U | Pass 18 (Rust Test Gaps/Accounting/FFI Types) | 14 | 3 | 4 | 4 | 3 | 0 | | V | Pass 19 (Lifecycle/Rust Subtleties/Test Infra) | 14 | 5 | 2 | 4 | 3 | 0 | | W | Pass 20 (Config/Math Signs/BingX Protocol) | 14 | 4 | 7 | 3 | 0 | 0 | -| **Total** | | **361** | **39** | **108** | **103** | **64** | **37** | +| X | Pass 21 (Rust Build/Deps/Python Packaging/Shared Mem) | 14 | 3 | 5 | 6 | 0 | 0 | +| **Total** | | **375** | **42** | **113** | **109** | **64** | **37** | --- @@ -363,6 +364,14 @@ | N9 | No `asyncio.all_tasks()` or task accounting — leaked tasks undetectable | All | Low | | N10 | `_snap_lock` no reader-side protection (informational) | Venue | Info | +### Fixes applied (2026-06-02 pass 3) + +| Flaw | Commit | What changed | +|------|--------|--------------| +| V1 — `LauncherBundle.close()` missing `kernel.close()` | `8d9762c` | `self.kernel.close()` wired into bundle teardown; Rust handle deterministically destroyed | +| V2 — `BingxVenueAdapter` no `close()` | `8d9762c` | `close()` added; shuts down class-level `ThreadPoolExecutor` + delegates to `backend.close()` | +| V3 — `seen_event_ids` not cleared on slot reuse | `8d9762c` | `slot.seen_event_ids.clear()` added to ENTER handler in Rust kernel; fill dedup no longer pollutes across trades | + --- ## O-Series: Sync/Async Wider Scope (Launcher, Generators, Streams, FFI, Tests) (Pass 12) @@ -524,9 +533,9 @@ | # | Flaw | Layer | Severity | |---|------|-------|----------| -| V1 | `DITAv2LauncherBundle.close()` never calls `kernel.close()` — Rust handle leaks via `__del__` | Launcher | **Critical** | -| V2 | `BingxVenueAdapter` no `close()`/`disconnect()` — ThreadPoolExecutor/HTTP never release | Venue | **Critical** | -| V3 | `process_intent` ENTER doesn't clear `seen_event_ids` — old dedup pollutes new trade | Rust | **High** | +| V1 | `DITAv2LauncherBundle.close()` never calls `kernel.close()` — Rust handle leaks via `__del__` — **✅ FIXED `8d9762c`** | Launcher | **Critical** | +| V2 | `BingxVenueAdapter` no `close()`/`disconnect()` — ThreadPoolExecutor/HTTP never release — **✅ FIXED `8d9762c`** | Venue | **Critical** | +| V3 | `process_intent` ENTER doesn't clear `seen_event_ids` — old dedup pollutes new trade — **✅ FIXED `8d9762c`** | Rust | **High** | | V4 | 3 generators write same output file — last writer wins, incompatible prologues | Test | **Critical** | | V5 | Generated tests triple env-gated — never run in CI, dead code | Test | **Critical** | | V6 | `kernel.close()` destroys Rust handle immediately — no drain, no flush, UAF risk | Bridge | **Critical** | @@ -564,6 +573,29 @@ --- +## X-Series: Rust Build/Deps, Python Packaging, Shared Memory Protocol (Pass 21) + +*Full detail in TRACE doc under "PASS 21 — RUST BUILD/DEPS, PYTHON PACKAGING, SHARED MEMORY PROTOCOL."* + +| # | Flaw | Layer | Severity | +|---|------|-------|----------| +| X1 | No ABI compatibility check on Rust `.so` load — stale binary corrupts silently | Bridge | **Critical** | +| X2 | `real_zinc_plane._write_region()` zeroes entire buffer before write — visible all-zero window | Plane | **Critical** | +| X3 | No `requirements.txt`/`setup.py`/`pyproject.toml` — zero Python dependency declarations | Build | **Critical** | +| X4 | `RealZincControlPlane.update()` no thread lock — concurrent calls corrupt seq and shared memory | Plane | **High** | +| X5 | `libc` declared in `Cargo.toml` but never used — dead dependency | Rust | **High** | +| X6 | 5 test files use hardcoded `sys.path.insert(0, "/mnt/dolphinng5_predict")` — non-portable | Test | **High** | +| X7 | `_decode_packet()` no try/except on `json.loads` — partial body read crashes reader | Plane | **High** | +| X8 | `ExchangeEvent`/`ExchangeEventKind` not exported from `__init__.py` | Bridge | **High** | +| X9 | No MSRV or `rust-toolchain.toml` — builds differ per Rust version | Rust | Medium | +| X10 | RealZincPlane and RealZincControlPlane collide on `{prefix}_control` region name | Plane | Medium | +| X11 | Sequence number decoded but never read by any consumer — dead data on wire | Plane | Medium | +| X12 | `_maybe_close()` `fut.result(timeout=10.0)` — TimeoutError strands coroutine | Launcher | Medium | +| X13 | `__init__.py` flat re-exports 45 names — naming collision risk | Bridge | Medium | +| X14 | `close()` not idempotent on RealZincPlane/RealZincControlPlane | Plane | Medium | + +--- + ## H-Series: Edge Domains — Dependencies, Error Handling, Types, Contracts (Pass 5) *Full detail in TRACE doc under "PASS 5 — EDGE DOMAINS."* diff --git a/prod/clean_arch/dita_v2/PINK_DITAv2_FLAW_ANALYSIS_2026-05-31.md b/prod/clean_arch/dita_v2/PINK_DITAv2_FLAW_ANALYSIS_2026-05-31.md index fe2a92d..5fdf98c 100644 --- a/prod/clean_arch/dita_v2/PINK_DITAv2_FLAW_ANALYSIS_2026-05-31.md +++ b/prod/clean_arch/dita_v2/PINK_DITAv2_FLAW_ANALYSIS_2026-05-31.md @@ -1,7 +1,7 @@ # PINK DITAv2 — Structural Flaw Analysis (CENTRAL) **Analysis date:** 2026-05-31 -**Last updated:** 2026-06-02 (flaw fix pass 2 — 5 more flaws closed; 13 total) +**Last updated:** 2026-06-02 (flaw fix pass 4 — W10 closed; 17 total fixed) **Scope:** Full PINK pipeline — all flaws across all modules. > **Fix notation:** Rows marked **✅ FIXED ``** are verified-fixed with a test commit on branch `exp/pink-ditav2-sprint0-20260530`. @@ -53,7 +53,9 @@ | T | Pass 17 (Unsafe Review/Dead Code/Build/Protocols) | 14 | 0 | 5 | 5 | 4 | 0 | | U | Pass 18 (Rust Test Gaps/Accounting/FFI Types) | 14 | 3 | 4 | 4 | 3 | 0 | | V | Pass 19 (Lifecycle/Rust Subtleties/Test Infra) | 14 | 5 | 2 | 4 | 3 | 0 | -| **Total** | | **347** | **35** | **101** | **100** | **64** | **37** | +| W | Pass 20 (Config/Math Signs/BingX Protocol) | 14 | 4 | 7 | 3 | 0 | 0 | +| X | Pass 21 (Rust Build/Deps/Python Packaging/Shared Mem) | 14 | 3 | 5 | 6 | 0 | 0 | +| **Total** | | **375** | **42** | **113** | **109** | **64** | **37** | --- @@ -362,6 +364,20 @@ | N9 | No `asyncio.all_tasks()` or task accounting — leaked tasks undetectable | All | Low | | N10 | `_snap_lock` no reader-side protection (informational) | Venue | Info | +### Fixes applied (2026-06-02 pass 3) + +| Flaw | Commit | What changed | +|------|--------|--------------| +| V1 — `LauncherBundle.close()` missing `kernel.close()` | `8d9762c` | `self.kernel.close()` wired into bundle teardown; Rust handle deterministically destroyed | +| V2 — `BingxVenueAdapter` no `close()` | `8d9762c` | `close()` added; shuts down class-level `ThreadPoolExecutor` + delegates to `backend.close()` | +| V3 — `seen_event_ids` not cleared on slot reuse | `8d9762c` | `slot.seen_event_ids.clear()` added to ENTER handler in Rust kernel; fill dedup no longer pollutes across trades | + +### Fixes applied (2026-06-02 pass 4) + +| Flaw | Commit | What changed | +|------|--------|--------------| +| W10 — `BingxHttpError` blindly mapped to "REJECTED" | `e90d542` | `_http_error_status()` helper: 429/5xx/transport → RATE_LIMITED; 4xx → REJECTED | + --- ## O-Series: Sync/Async Wider Scope (Launcher, Generators, Streams, FFI, Tests) (Pass 12) @@ -523,9 +539,9 @@ | # | Flaw | Layer | Severity | |---|------|-------|----------| -| V1 | `DITAv2LauncherBundle.close()` never calls `kernel.close()` — Rust handle leaks via `__del__` | Launcher | **Critical** | -| V2 | `BingxVenueAdapter` no `close()`/`disconnect()` — ThreadPoolExecutor/HTTP never release | Venue | **Critical** | -| V3 | `process_intent` ENTER doesn't clear `seen_event_ids` — old dedup pollutes new trade | Rust | **High** | +| V1 | `DITAv2LauncherBundle.close()` never calls `kernel.close()` — Rust handle leaks via `__del__` — **✅ FIXED `8d9762c`** | Launcher | **Critical** | +| V2 | `BingxVenueAdapter` no `close()`/`disconnect()` — ThreadPoolExecutor/HTTP never release — **✅ FIXED `8d9762c`** | Venue | **Critical** | +| V3 | `process_intent` ENTER doesn't clear `seen_event_ids` — old dedup pollutes new trade — **✅ FIXED `8d9762c`** | Rust | **High** | | V4 | 3 generators write same output file — last writer wins, incompatible prologues | Test | **Critical** | | V5 | Generated tests triple env-gated — never run in CI, dead code | Test | **Critical** | | V6 | `kernel.close()` destroys Rust handle immediately — no drain, no flush, UAF risk | Bridge | **Critical** | @@ -540,6 +556,52 @@ --- +## W-Series: Configuration Management, Math Sign Conventions, BingX Protocol (Pass 20) + +*Full detail in TRACE doc under "PASS 20 — CONFIGURATION MANAGEMENT, MATH SIGN CONVENTIONS, BINGX PROTOCOL."* + +| # | Flaw | Layer | Severity | +|---|------|-------|----------| +| W1 | `int()` on 3 env vars uncaught `ValueError` — non-numeric input crashes process | Config | **Critical** | +| W2 | `DITA_V2_PREFIX` default `"dita_v2"` — multi-process shared memory corruption | Config | **Critical** | +| W3 | Funding sign opposite Python V2 vs Rust — same raw value opposite capital effect | Accounting | **Critical** | +| W4 | `listenKeyExpired` frames silently swallowed — `continue` skips expiry check, dead code | Venue | **Critical** | +| W5 | `RECV_WINDOW_MS` no upper bound — extreme values enable replay attacks | Config | **High** | +| W6 | `ACTIVE_SLOT_LIMIT` stored but never enforced by Rust kernel — dead config | Config | **High** | +| W7 | No fill history fetched during WS reconnect gap-backfill — fills permanently lost | Venue | **High** | +| W8 | Rate limit detection fails on HTTP 429 without matching message — returns 0 instant retry | Venue | **High** | +| W9 | `CONTROL_PLANE=REAL_ZINC` silently falls back to in-memory — no persistence | Config | **High** | +| W10 | All `BingxHttpError` mapped to "REJECTED" — can't distinguish errors from real rejections — **✅ FIXED `e90d542`** | Venue | **High** | +| W11 | `os.environ["KEY"]` bracket access in tests vs `.get()` in launcher — inconsistent | Test | **High** | +| W12 | `MockVenueScenario` no `rate_limit` flag — RATE_LIMITED path untested in CI | Test | Medium | +| W13 | Rate-limit regex uses English phrase `"unblocked after"` — non-portable | Venue | Medium | +| W14 | Invalid `ACTIVE_SLOT_LIMIT` values silently discarded — no log, no warning | Config | Medium | + +--- + +## X-Series: Rust Build/Deps, Python Packaging, Shared Memory Protocol (Pass 21) + +*Full detail in TRACE doc under "PASS 21 — RUST BUILD/DEPS, PYTHON PACKAGING, SHARED MEMORY PROTOCOL."* + +| # | Flaw | Layer | Severity | +|---|------|-------|----------| +| X1 | No ABI compatibility check on Rust `.so` load — stale binary corrupts silently | Bridge | **Critical** | +| X2 | `real_zinc_plane._write_region()` zeroes entire buffer before write — visible all-zero window | Plane | **Critical** | +| X3 | No `requirements.txt`/`setup.py`/`pyproject.toml` — zero Python dependency declarations | Build | **Critical** | +| X4 | `RealZincControlPlane.update()` no thread lock — concurrent calls corrupt seq and shared memory | Plane | **High** | +| X5 | `libc` declared in `Cargo.toml` but never used — dead dependency | Rust | **High** | +| X6 | 5 test files use hardcoded `sys.path.insert(0, "/mnt/dolphinng5_predict")` — non-portable | Test | **High** | +| X7 | `_decode_packet()` no try/except on `json.loads` — partial body read crashes reader | Plane | **High** | +| X8 | `ExchangeEvent`/`ExchangeEventKind` not exported from `__init__.py` | Bridge | **High** | +| X9 | No MSRV or `rust-toolchain.toml` — builds differ per Rust version | Rust | Medium | +| X10 | RealZincPlane and RealZincControlPlane collide on `{prefix}_control` region name | Plane | Medium | +| X11 | Sequence number decoded but never read by any consumer — dead data on wire | Plane | Medium | +| X12 | `_maybe_close()` `fut.result(timeout=10.0)` — TimeoutError strands coroutine | Launcher | Medium | +| X13 | `__init__.py` flat re-exports 45 names — naming collision risk | Bridge | Medium | +| X14 | `close()` not idempotent on RealZincPlane/RealZincControlPlane | Plane | Medium | + +--- + ## H-Series: Edge Domains — Dependencies, Error Handling, Types, Contracts (Pass 5) *Full detail in TRACE doc under "PASS 5 — EDGE DOMAINS."* diff --git a/prod/clean_arch/dita_v2/_rust_kernel/src/lib.rs b/prod/clean_arch/dita_v2/_rust_kernel/src/lib.rs index 41dd4ac..d59e884 100644 --- a/prod/clean_arch/dita_v2/_rust_kernel/src/lib.rs +++ b/prod/clean_arch/dita_v2/_rust_kernel/src/lib.rs @@ -1275,6 +1275,10 @@ impl KernelCore { slot.active_exit_order = None; slot.close_reason.clear(); slot.closed = false; + // V3: clear per-slot dedup set so events from a previous trade on + // this slot cannot falsely deduplicate events for the new trade. + // (The venue adapter's _event_seq resets on restart, so IDs repeat.) + slot.seen_event_ids.clear(); slot.last_event_time = None; slot.fsm_state = TradeStage::ORDER_REQUESTED; slot.attach_entry_order(VenueOrder { diff --git a/prod/clean_arch/dita_v2/bingx_venue.py b/prod/clean_arch/dita_v2/bingx_venue.py index 9c8de29..b5b1c50 100644 --- a/prod/clean_arch/dita_v2/bingx_venue.py +++ b/prod/clean_arch/dita_v2/bingx_venue.py @@ -77,6 +77,24 @@ def _trade_side_from_row(row: dict[str, Any], *, fallback: TradeSide = TradeSide return fallback +def _http_error_status(exc_msg: str) -> str: + """Map a BingxHttpError message to a venue status string. + + HTTP 429 and 5xx are transient → RATE_LIMITED so the slot can retry. + 4xx (non-429) are genuine client-side rejections → REJECTED. + Transport / DNS / circuit-breaker errors (no HTTP prefix) are transient. + """ + m = exc_msg.upper() + if "HTTP 429" in m: + return "RATE_LIMITED" + for code in ("500", "501", "502", "503", "504"): + if f"HTTP {code}" in m: + return "RATE_LIMITED" + if "HTTP 4" in m: + return "REJECTED" + return "RATE_LIMITED" + + def _venue_event_status_from_row(status: str) -> VenueEventStatus: normalized = _normalize_status(status) if normalized in {"NEW", "ACKED", "PENDING", "CREATED"}: @@ -246,6 +264,21 @@ class BingxVenueAdapter(VenueAdapter): ) from exc return result + def close(self) -> None: + """V2: release the class-level thread-pool and any backend HTTP session.""" + executor = self.__class__._EXECUTOR + if executor is not None: + with self.__class__._EXECUTOR_LOCK: + if self.__class__._EXECUTOR is executor: + self.__class__._EXECUTOR = None + executor.shutdown(wait=False) + _maybe_close_backend = getattr(self.backend, "close", None) + if _maybe_close_backend is not None: + try: + _maybe_close_backend() + except Exception: + pass + def _call_backend(self, method_name: str, *args: Any, **kwargs: Any) -> Any: method = getattr(self.backend, method_name, None) if method is None: @@ -347,7 +380,8 @@ class BingxVenueAdapter(VenueAdapter): try: response = self._run(client.signed_delete("/openApi/swap/v2/trade/order", params)) except BingxHttpError as exc: - response = {"status": "REJECTED", "msg": str(exc), "orderId": order.venue_order_id, "clientOrderId": order.venue_client_id} + # W10: map HTTP error class to status — 429/5xx are transient, 4xx are real rejections + response = {"status": _http_error_status(str(exc)), "msg": str(exc), "orderId": order.venue_order_id, "clientOrderId": order.venue_client_id} snapshot_after = self._backend_snapshot(include_history=True) return self._events_from_cancel(order, response, snapshot_before, snapshot_after, reason=reason) diff --git a/prod/clean_arch/dita_v2/launcher.py b/prod/clean_arch/dita_v2/launcher.py index df859c9..2fb185e 100644 --- a/prod/clean_arch/dita_v2/launcher.py +++ b/prod/clean_arch/dita_v2/launcher.py @@ -75,6 +75,8 @@ class DITAv2LauncherBundle: _maybe_close(self.venue) _maybe_close(self.zinc_plane) _maybe_close(self.control_plane) + # V1: kernel.close() was added (O10) but never wired into bundle teardown. + self.kernel.close() def _env_upper(name: str, default: str = "") -> str: diff --git a/prod/clean_arch/dita_v2/test_flaws.py b/prod/clean_arch/dita_v2/test_flaws.py index a1fae88..8135134 100644 --- a/prod/clean_arch/dita_v2/test_flaws.py +++ b/prod/clean_arch/dita_v2/test_flaws.py @@ -970,6 +970,97 @@ class TestO1MaybeCloseAsyncSafe: assert closed == [True], "sync close() must still be called" +# ============================================================ +# V3: seen_event_ids must be cleared on slot reuse (ENTER after CLOSE) +# ============================================================ + +class TestV3SeenEventIdsClearedOnReuse: + """V3: If seen_event_ids from a previous trade survive into the next trade + on the same slot, events whose IDs happen to match will be silently dropped. + This is guaranteed after a restart because the venue adapter's _event_seq + resets to 1, so EV-00000001 collides with the old trade's first event.""" + + def test_second_trade_fill_not_deduped(self): + """Fill on a reused slot must not be swallowed by stale dedup set.""" + k = _fresh_kernel() + + # Trade 1: enter and exit + k.process_intent(_mk_intent(action=E.ENTER, trade_id="v3-t1")) + k.process_intent(_mk_intent(action=E.EXIT, trade_id="v3-t1")) + assert k._get_slot(0).is_free(), "Trade 1 must close cleanly" + + # Trade 2 on the same slot — inject a fill with an event_id that + # matches what the venue adapter would assign after a restart (EV-00000001). + k.process_intent(_mk_intent(action=E.ENTER, trade_id="v3-t2")) + fill = _mk_venue_event( + kind=KernelEventKind.FULL_FILL, + trade_id="v3-t2", + event_id="EV-00000001", # same ID the adapter emits on restart + price=100.0, + size=1.0, + filled_size=1.0, + ) + result = k.on_venue_event(fill) + + slot = k._get_slot(0) + assert slot.fsm_state == TradeStage.POSITION_OPEN, ( + f"V3: fill for trade 2 must not be deduped — got {slot.fsm_state}, " + f"diagnostic={result.diagnostic_code}" + ) + + def test_seen_event_ids_smaller_after_slot_reuse(self): + """After a trade closes and a new ENTER starts, seen_event_ids must + contain only IDs from the new trade — not the accumulated IDs from the + prior trade on the same slot.""" + # Auto-fill kernel: each trade generates ~2 events (ORDER_ACK + FILL). + k = _fresh_kernel() + k.process_intent(_mk_intent(action=E.ENTER, trade_id="v3-s1")) + k.process_intent(_mk_intent(action=E.EXIT, trade_id="v3-s1")) + assert k._get_slot(0).is_free(), "Seed trade must close cleanly" + ids_after_seed = list(k._get_slot(0).seen_event_ids) + # Trade 1 generated at least 2 events (ORDER_ACK + FULL_FILL × 2) + assert len(ids_after_seed) >= 2, "Seed trade must have populated seen_event_ids" + + # Fresh ENTER on same slot (auto-fills → adds ~2 more events). + k.process_intent(_mk_intent(action=E.ENTER, trade_id="v3-s2")) + ids_after_fresh = list(k._get_slot(0).seen_event_ids) + + # With V3 fix: fresh trade starts from 0 then adds its own events → small count. + # Without fix: old IDs remain → count = len(ids_after_seed) + new_events. + assert len(ids_after_fresh) < len(ids_after_seed), ( + f"V3: seen_event_ids must be cleared on ENTER. " + f"After seed: {len(ids_after_seed)} IDs, after fresh ENTER: {len(ids_after_fresh)} IDs. " + f"Expected fewer, not more." + ) + + +# ============================================================ +# V1+V2: LauncherBundle.close() wires kernel; BingxVenueAdapter.close() +# ============================================================ + +class TestV1V2LauncherAndVenueClose: + """V1: LauncherBundle.close() must call kernel.close(). + V2: BingxVenueAdapter.close() must exist and release the thread pool.""" + + def test_launcher_bundle_close_calls_kernel_close(self): + """V1: kernel._backend must be None after bundle.close().""" + from prod.clean_arch.dita_v2.launcher import build_launcher_bundle + bundle = build_launcher_bundle(max_slots=2) + assert bundle.kernel._backend is not None + bundle.close() + assert bundle.kernel._backend is None, ( + "V1: LauncherBundle.close() must call kernel.close()" + ) + + def test_bingx_venue_adapter_has_close(self): + """V2: BingxVenueAdapter must have a close() method.""" + from prod.clean_arch.dita_v2.bingx_venue import BingxVenueAdapter + adapter = object.__new__(BingxVenueAdapter) + assert callable(getattr(adapter, "close", None)), ( + "V2: BingxVenueAdapter must have a close() method" + ) + + # ============================================================ # M9: ORDER_REJECT must NOT nuke a live POSITION_OPEN slot # ============================================================ @@ -1118,3 +1209,46 @@ class TestH6SafeEnum: from prod.clean_arch.dita_v2.rust_backend import _safe_enum result = _safe_enum(TradeStage, "", TradeStage.IDLE) assert result == TradeStage.IDLE + + +# ============================================================ +# W10: BingxHttpError must not be blindly mapped to "REJECTED" +# ============================================================ + +class TestW10HttpErrorMapping: + """W10: _http_error_status() must distinguish transient errors (429, 5xx, + DNS/transport) from genuine 4xx rejections — so the kernel sees RATE_LIMITED + vs CANCEL_REJECT for the right cases.""" + + def _status(self, msg: str) -> str: + from prod.clean_arch.dita_v2.bingx_venue import _http_error_status + return _http_error_status(msg) + + def test_429_is_rate_limited(self): + assert self._status("HTTP 429: Too Many Requests") == "RATE_LIMITED", ( + "W10: 429 must map to RATE_LIMITED, not REJECTED" + ) + + def test_503_is_rate_limited(self): + assert self._status("HTTP 503: Service Unavailable") == "RATE_LIMITED", ( + "W10: 503 (transient server error) must map to RATE_LIMITED" + ) + + def test_500_is_rate_limited(self): + assert self._status("HTTP 500: Internal Server Error") == "RATE_LIMITED" + + def test_400_is_rejected(self): + assert self._status("HTTP 400: invalid symbol") == "REJECTED", ( + "W10: genuine 4xx client error must map to REJECTED" + ) + + def test_403_is_rejected(self): + assert self._status("HTTP 403: Forbidden") == "REJECTED" + + def test_transport_error_is_rate_limited(self): + assert self._status("DELETE /openApi/swap/v2/trade/order failed: ConnectionError") == "RATE_LIMITED", ( + "W10: DNS/transport errors (no HTTP prefix) must map to RATE_LIMITED" + ) + + def test_dns_error_is_rate_limited(self): + assert self._status("Name or service not known") == "RATE_LIMITED"