Skip to content

Commit 9798a59

Browse files
feat: add flatten opt-out + fix field names containing >
Adds a GenericOptions dataclass with a no_flatten field to disable nested object flattening. Implements spec rule 7.4.6.1.4: fields with > in their names are excluded from tabular columns and emitted as per-row attachments instead. The decoder now accepts orphan and scalar attachments and no longer treats a literal > in a key name as a path separator. 200K round-trips (both modes, > in keys) completed with zero failures. Adds 12 edge case tests.
1 parent 647e4e3 commit 9798a59

8 files changed

Lines changed: 195 additions & 70 deletions

File tree

CHANGELOG.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,18 @@
11
# Changelog
22

3+
## v2.2.1 (2026-06-23)
4+
5+
### Flatten Opt-Out
6+
7+
- Added `GenericOptions` dataclass with `no_flatten` field to disable nested object flattening
8+
- `encode_generic(data, GenericOptions(no_flatten=True))` produces attachment syntax instead of path columns
9+
- Backward compatible: `encode_generic(data)` behavior unchanged (flatten on by default)
10+
- Fixed: field names containing `>` no longer appear as tabular columns (spec rule 7.4.6.1.4)
11+
- Fixed: field names containing `>` are no longer eligible for flattening analysis
12+
- Fixed: decoder no longer treats literal `>` in key names as a path separator
13+
- Fixed: decoder accepts orphan attachments (fields excluded from column list)
14+
- Added 12 targeted edge-case tests for `>` in field names
15+
316
## v2.2.0 (2026-06-22)
417

518
### Spec v3.2: Nested Object Flattening

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
44

55
[project]
66
name = "gcf-python"
7-
version = "2.2.0"
7+
version = "2.2.1"
88
description = "The AI-native wire format for structured data. 50-92% fewer tokens than JSON. 100% comprehension on every frontier model. Zero dependencies."
99
readme = "README.md"
1010
license = {text = "MIT"}

src/gcf/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
from .decode import DecodeError, decode
3939
from .delta import encode_delta
4040
from .encode import encode
41-
from .generic import encode_generic
41+
from .generic import encode_generic, GenericOptions
4242
from .session import Session, encode_with_session
4343
from .decode_generic import decode_generic
4444
from .stream import StreamEncoder
@@ -62,6 +62,7 @@
6262
"encode",
6363
"encode_delta",
6464
"encode_generic",
65+
"GenericOptions",
6566
"encode_with_session",
6667
]
6768

src/gcf/decode_generic.py

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,14 @@ def _parse_attachment(
360360
arr, consumed = _parse_array_from_header(lines, line_idx, depth, after_name)
361361
return name, arr, consumed, None
362362

363+
# Scalar: =value (field names containing ">" excluded from tabular columns).
364+
if after_name.startswith("="):
365+
val_str = after_name[1:]
366+
parsed = parse_scalar(val_str, tabular_context=True)
367+
if parsed is MISSING:
368+
return name, None, 1, None
369+
return name, parsed, 1, None
370+
363371
raise ValueError(f"invalid attachment form: {after_name}")
364372

365373

@@ -423,7 +431,11 @@ def _parse_tabular_body(
423431
path_column_map: dict[str, list[str]] = {}
424432
for f in fields:
425433
if ">" in f:
426-
path_column_map[f] = f.split(">")
434+
parts = f.split(">")
435+
# Only treat as a path column if all segments are non-empty.
436+
# A literal key like ">" would split into ["", ""].
437+
if all(p for p in parts):
438+
path_column_map[f] = parts
427439

428440
# Track inline schemas and shared array schemas.
429441
inline_schemas: dict[str, list[str]] = {}
@@ -506,10 +518,10 @@ def _parse_tabular_body(
506518
all_att_fields = traditional_att_fields + inline_att_fields
507519
attachment_values: dict[str, Any] = {}
508520

509-
if row_has_id and all_att_fields:
521+
if row_has_id:
510522
inline_idx = 0
511523

512-
while i < len(lines) and len(attachment_values) < len(all_att_fields):
524+
while i < len(lines):
513525
a_line = lines[i]
514526
a_content: str | None = None
515527
if depth == 0 or a_line.startswith(ind):
@@ -601,13 +613,6 @@ def _parse_tabular_body(
601613
if extra_name in attachment_values:
602614
raise ValueError(f"duplicate_attachment: {extra_name}")
603615

604-
if not row_has_id or not all_att_fields:
605-
att_indent = ind + " "
606-
if i < len(lines) and lines[i].startswith(att_indent):
607-
peek = lines[i][len(att_indent):]
608-
if peek.startswith("."):
609-
raise ValueError(f"orphan_attachment: {peek}")
610-
611616
row: dict[str, Any] = {}
612617
for f in fields:
613618
if f in missing_fields:
@@ -616,6 +621,10 @@ def _parse_tabular_body(
616621
row[f] = cell_values[f]
617622
elif f in attachment_values:
618623
row[f] = attachment_values[f]
624+
# Also add any orphan attachment values (fields excluded from column list, e.g. ">" fields).
625+
for k, v in attachment_values.items():
626+
if k not in row:
627+
row[k] = v
619628
# Unflatten path columns into nested objects.
620629
if path_column_map:
621630
nested = _unflatten_paths(path_column_map, flat_values, flat_absent)

src/gcf/generic.py

Lines changed: 83 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -2,42 +2,55 @@
22

33
from __future__ import annotations
44

5+
from dataclasses import dataclass
56
from typing import Any
67

78
from .scalar import format_scalar, format_key
89

910

10-
def encode_generic(data: Any) -> str:
11+
@dataclass
12+
class GenericOptions:
13+
"""Options for controlling generic encoding behavior."""
14+
no_flatten: bool = False
15+
"""When True, disables promotion of fixed-shape nested objects to path
16+
columns (e.g. "customer>name"). Nested objects use attachment syntax
17+
instead. Set when targeting open-weight models that show lower
18+
comprehension on flattened encoding."""
19+
20+
21+
def encode_generic(data: Any, opts: GenericOptions | None = None) -> str:
22+
if opts is None:
23+
opts = GenericOptions()
1124
out: list[str] = ["GCF profile=generic"]
12-
_encode_root_value(data, out)
25+
_encode_root_value(data, out, opts)
1326
return "\n".join(out) + "\n"
1427

1528

16-
def _encode_root_value(v: Any, out: list[str]) -> None:
29+
def _encode_root_value(v: Any, out: list[str], opts: GenericOptions) -> None:
1730
if v is None:
1831
out.append("=-")
1932
elif isinstance(v, dict):
20-
_encode_object(v, out, 0)
33+
_encode_object(v, out, 0, opts)
2134
elif isinstance(v, list):
22-
_encode_root_array(v, out)
35+
_encode_root_array(v, out, opts)
2336
else:
2437
out.append(f"={format_scalar(v)}")
2538

2639

27-
def _encode_object(d: dict, out: list[str], depth: int) -> None:
40+
def _encode_object(d: dict, out: list[str], depth: int, opts: GenericOptions) -> None:
2841
prefix = _indent(depth)
2942
for key, value in d.items():
3043
fk = format_key(key)
3144
if isinstance(value, dict):
3245
out.append(f"{prefix}## {fk}")
33-
_encode_object(value, out, depth + 1)
46+
_encode_object(value, out, depth + 1, opts)
3447
elif isinstance(value, list):
35-
_encode_named_array(fk, value, out, depth)
48+
_encode_named_array(fk, value, out, depth, opts)
3649
else:
3750
out.append(f"{prefix}{fk}={format_scalar(value)}")
3851

3952

40-
def _encode_root_array(arr: list, out: list[str]) -> None:
53+
def _encode_root_array(arr: list, out: list[str], opts: GenericOptions) -> None:
4154
if not arr:
4255
out.append("## [0]")
4356
return
@@ -47,12 +60,12 @@ def _encode_root_array(arr: list, out: list[str]) -> None:
4760
return
4861
fields = _tabular_fields(arr)
4962
if fields is not None:
50-
_encode_tabular("## ", arr, fields, out, 0)
63+
_encode_tabular("## ", arr, fields, out, 0, opts)
5164
return
52-
_encode_expanded("## ", arr, out, 0)
65+
_encode_expanded("## ", arr, out, 0, opts)
5366

5467

55-
def _encode_named_array(name: str, arr: list, out: list[str], depth: int) -> None:
68+
def _encode_named_array(name: str, arr: list, out: list[str], depth: int, opts: GenericOptions) -> None:
5669
prefix = _indent(depth)
5770
if not arr:
5871
out.append(f"{prefix}## {name} [0]")
@@ -63,9 +76,9 @@ def _encode_named_array(name: str, arr: list, out: list[str], depth: int) -> Non
6376
return
6477
fields = _tabular_fields(arr)
6578
if fields is not None:
66-
_encode_tabular(f"{prefix}## {name} ", arr, fields, out, depth)
79+
_encode_tabular(f"{prefix}## {name} ", arr, fields, out, depth, opts)
6780
return
68-
_encode_expanded(f"{prefix}## {name} ", arr, out, depth)
81+
_encode_expanded(f"{prefix}## {name} ", arr, out, depth, opts)
6982

7083

7184
def _tabular_fields(arr: list) -> list[str] | None:
@@ -152,6 +165,9 @@ def _analyze_flattenable(
152165
arr: list[dict], field_name: str, parent_path: str
153166
) -> list[dict] | None:
154167
"""Analyze whether a field can be flattened. Returns list of leaf descriptors or None."""
168+
# Field names containing ">" cannot be flattened (would create ambiguous paths).
169+
if ">" in field_name:
170+
return None
155171
canonical_shape: dict[str, str] | None = None # key -> "scalar" | "nested"
156172

157173
for item in arr:
@@ -249,26 +265,39 @@ def _resolve_key_chain(item: Any, keys: list[str]) -> tuple[Any, bool]:
249265

250266

251267
def _encode_tabular(
252-
header_prefix: str, arr: list[dict], fields: list[str], out: list[str], depth: int
268+
header_prefix: str, arr: list[dict], fields: list[str], out: list[str], depth: int, opts: GenericOptions
253269
) -> None:
254270
prefix = _indent(depth)
255271

256272
# Phase 0: Analyze fields for flattening.
257273
flatten_map: dict[str, list[dict]] = {}
258-
for f in fields:
259-
leaves = _analyze_flattenable(arr, f, "")
260-
if leaves and len(leaves) > 0:
261-
flatten_map[f] = leaves
274+
if not opts.no_flatten:
275+
for f in fields:
276+
leaves = _analyze_flattenable(arr, f, "")
277+
if leaves and len(leaves) > 0:
278+
flatten_map[f] = leaves
279+
280+
# Fields whose names contain ">" must not appear as tabular columns
281+
# because the decoder would interpret them as flattened path columns.
282+
# Track them for per-row attachment emission (spec rule 7.4.6.1.4).
283+
gt_fields = {f for f in fields if f not in flatten_map and ">" in f}
262284

263285
# Build expanded column list.
264286
columns: list[dict] = []
265287
for f in fields:
288+
if f in gt_fields:
289+
continue
266290
if f in flatten_map:
267291
for leaf in flatten_map[f]:
268292
columns.append({"header": format_key(leaf["path"]), "type": "flat", "field": f, "keys": leaf["keys"]})
269293
else:
270294
columns.append({"header": format_key(f), "type": "original", "field": f, "keys": []})
271295

296+
# If all fields were excluded (all contain ">"), fall back to expanded.
297+
if not columns:
298+
_encode_expanded(header_prefix, arr, out, depth, opts)
299+
return
300+
272301
# Pre-compute inline schemas and shared array schemas (skip flattened fields).
273302
inline_schemas: dict[str, list[str]] = {}
274303
shared_arr_schemas: dict[str, list[str]] = {}
@@ -333,6 +362,15 @@ def _encode_tabular(
333362
else:
334363
cells.append(format_scalar(v, "|"))
335364

365+
# Emit fields with ">" in their names as per-row attachments.
366+
for f in fields:
367+
if f not in gt_fields:
368+
continue
369+
if f not in item:
370+
continue
371+
row_has_attachment = True
372+
attachments.append((f, item[f], False, None))
373+
336374
row = "|".join(cells)
337375
if row_has_attachment:
338376
out.append(f"{prefix}@{i} {row}")
@@ -351,17 +389,25 @@ def _encode_tabular(
351389
elif isinstance(att_val, list):
352390
sas = shared_arr_schemas.get(att_name)
353391
if sas and i > 0:
354-
_encode_attachment_array_shared(prefix, fk, att_val, out, depth + 2, sas)
392+
_encode_attachment_array_shared(prefix, fk, att_val, out, depth + 2, sas, opts)
355393
else:
356-
_encode_attachment_array(prefix, fk, att_val, out, depth + 2)
394+
_encode_attachment_array(prefix, fk, att_val, out, depth + 2, opts)
357395
elif isinstance(att_val, dict):
358396
out.append(f"{prefix}.{fk} {{}}")
359-
_encode_object(att_val, out, depth + 2)
397+
_encode_object(att_val, out, depth + 2, opts)
398+
else:
399+
# Scalar attachment (e.g. field names containing ">").
400+
if att_val is None:
401+
out.append(f"{prefix}.{fk} =-")
402+
else:
403+
out.append(f"{prefix}.{fk} ={format_scalar(att_val)}")
360404

361405

362406
def _encode_attachment_array(
363-
att_prefix: str, fk: str, arr: list, out: list[str], depth: int
407+
att_prefix: str, fk: str, arr: list, out: list[str], depth: int, opts: GenericOptions | None = None
364408
) -> None:
409+
if opts is None:
410+
opts = GenericOptions()
365411
if not arr:
366412
out.append(f"{att_prefix}.{fk} [0]")
367413
elif _all_primitives(arr):
@@ -370,13 +416,13 @@ def _encode_attachment_array(
370416
else:
371417
fields = _tabular_fields(arr)
372418
if fields is not None:
373-
_encode_tabular(f"{att_prefix}.{fk} ", arr, fields, out, depth)
419+
_encode_tabular(f"{att_prefix}.{fk} ", arr, fields, out, depth, opts)
374420
else:
375-
_encode_expanded(f"{att_prefix}.{fk} ", arr, out, depth)
421+
_encode_expanded(f"{att_prefix}.{fk} ", arr, out, depth, opts)
376422

377423

378424
def _encode_attachment_array_shared(
379-
att_prefix: str, fk: str, arr: list, out: list[str], depth: int, shared_fields: list[str]
425+
att_prefix: str, fk: str, arr: list, out: list[str], depth: int, shared_fields: list[str], opts: GenericOptions | None = None
380426
) -> None:
381427
if not arr:
382428
out.append(f"{att_prefix}.{fk} [0]")
@@ -403,25 +449,29 @@ def _encode_attachment_array_shared(
403449
out.append(f"{prefix}{'|'.join(cells)}")
404450
else:
405451
# Fields don't match: fall back to full encoding.
406-
_encode_attachment_array(att_prefix, fk, arr, out, depth)
452+
_encode_attachment_array(att_prefix, fk, arr, out, depth, opts)
407453

408454

409-
def _encode_expanded(header_prefix: str, arr: list, out: list[str], depth: int) -> None:
455+
def _encode_expanded(header_prefix: str, arr: list, out: list[str], depth: int, opts: GenericOptions | None = None) -> None:
456+
if opts is None:
457+
opts = GenericOptions()
410458
prefix = _indent(depth)
411459
out.append(f"{header_prefix}[{len(arr)}]")
412460
for i, item in enumerate(arr):
413461
if isinstance(item, dict):
414462
out.append(f"{prefix}@{i} {{}}")
415-
_encode_object(item, out, depth + 1)
463+
_encode_object(item, out, depth + 1, opts)
416464
elif isinstance(item, list):
417-
_encode_expanded_array_item(prefix, i, item, out, depth)
465+
_encode_expanded_array_item(prefix, i, item, out, depth, opts)
418466
else:
419467
out.append(f"{prefix}@{i} ={format_scalar(item)}")
420468

421469

422470
def _encode_expanded_array_item(
423-
prefix: str, idx: int, arr: list, out: list[str], depth: int
471+
prefix: str, idx: int, arr: list, out: list[str], depth: int, opts: GenericOptions | None = None
424472
) -> None:
473+
if opts is None:
474+
opts = GenericOptions()
425475
if not arr:
426476
out.append(f"{prefix}@{idx} [0]")
427477
elif _all_primitives(arr):
@@ -430,9 +480,9 @@ def _encode_expanded_array_item(
430480
else:
431481
fields = _tabular_fields(arr)
432482
if fields is not None:
433-
_encode_tabular(f"{prefix}@{idx} ", arr, fields, out, depth + 1)
483+
_encode_tabular(f"{prefix}@{idx} ", arr, fields, out, depth + 1, opts)
434484
else:
435-
_encode_expanded(f"{prefix}@{idx} ", arr, out, depth + 1)
485+
_encode_expanded(f"{prefix}@{idx} ", arr, out, depth + 1, opts)
436486

437487

438488
def _all_primitives(arr: list) -> bool:

tests/test_conformance_v2.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,9 +98,9 @@ def test_conformance(rel_path, data):
9898
), f"decode mismatch:\n got: {got}\n exp: {data['expected']}"
9999

100100
elif op == "roundtrip":
101-
# Encode, verify GCF output matches expected, then decode and verify round-trip.
101+
# Encode, verify GCF output matches expected (if provided), then decode and verify round-trip.
102102
got = encode_generic(data["input"])
103-
if isinstance(data["expected"], str):
103+
if "expected" in data and isinstance(data["expected"], str):
104104
assert got == data["expected"], f"encode mismatch:\n got: {got!r}\n exp: {data['expected']!r}"
105105
decoded = decode_generic(got)
106106
assert _structural_equal(

0 commit comments

Comments
 (0)