-
Notifications
You must be signed in to change notification settings - Fork 57
Expand file tree
/
Copy pathutils.py
More file actions
217 lines (166 loc) Β· 6.72 KB
/
utils.py
File metadata and controls
217 lines (166 loc) Β· 6.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
# Copyright (c) 2025 TOON Format Organization
# SPDX-License-Identifier: MIT
"""Utilities for TOON format.
This module provides utilities for:
- Token analysis and efficiency comparison between JSON and TOON formats
- JSON integration and null value handling
- Estimating API costs and optimizing prompt sizes
Functions:
count_tokens: Count tokens in a text string
estimate_savings: Compare JSON vs TOON token counts
compare_formats: Generate formatted comparison table
loads: Parse JSON string into Python objects (alias for json.loads)
encode_json: Encode a JSON string directly into TOON format
Requirements:
tiktoken: Install with `uv add tiktoken` or `uv add toon_format[benchmark]`
"""
import functools
import json
from typing import Any, Dict
from .encoder import encode
# Names exported as the public API of this module.
__all__ = ["count_tokens", "estimate_savings", "compare_formats", "encode_json", "loads"]
# Error message used when the optional tiktoken dependency is not installed;
# raised as RuntimeError by the token-counting helpers below.
_TIKTOKEN_MISSING_MSG = (
    "tiktoken is required for token counting. "
    "Install with: uv add tiktoken or uv add toon_format[benchmark]"
)
def loads(json_string: str) -> Any:
    """Deserialize a JSON document into Python objects.

    Thin convenience alias for :func:`json.loads`, kept so the TOON
    integration flow has a single entry point where JSON ``null`` is
    mapped to Python ``None``.

    Args:
        json_string: JSON text to deserialize.

    Returns:
        Any: The resulting Python data structure.
    """
    return json.loads(json_string)
def encode_json(json_string: str) -> str:
    """Convert a JSON document straight to TOON format.

    Equivalent to ``encode(loads(json_string))``: the JSON text is parsed
    (with ``null`` becoming ``None``) and the resulting Python object is
    encoded as TOON.

    Args:
        json_string: JSON text to convert.

    Returns:
        str: TOON-formatted string.

    Example:
        >>> import toon_format
        >>> toon_format.encode_json('{"abc": null}')
        'abc: null'
    """
    return encode(loads(json_string))
def _require_tiktoken():
    """Import and return the tiktoken module, failing with a helpful message.

    Returns:
        module: The imported ``tiktoken`` module.

    Raises:
        RuntimeError: If tiktoken is not installed.
    """
    try:
        import tiktoken  # type: ignore[import-not-found]
    except ImportError as exc:  # pragma: no cover - exercised via count_tokens
        raise RuntimeError(_TIKTOKEN_MISSING_MSG) from exc
    else:
        return tiktoken
@functools.lru_cache(maxsize=1)
def _get_tokenizer():
    """Return the cached o200k_base tiktoken tokenizer.

    The result is memoized (single entry) so repeated token counts do not
    rebuild the encoding.

    Returns:
        tiktoken.Encoding: The o200k_base tokenizer (gpt5/gpt5-mini).

    Raises:
        RuntimeError: If tiktoken is not installed.
    """
    return _require_tiktoken().get_encoding("o200k_base")
def count_tokens(text: str, encoding: str = "o200k_base") -> int:
    """Count tokens in a text string using tiktoken.

    Args:
        text: The string to tokenize.
        encoding: Tokenizer encoding name (default: 'o200k_base' for
            gpt5/gpt5-mini). Other options include 'cl100k_base' (GPT-3.5)
            and 'p50k_base' (older models).

    Returns:
        int: The number of tokens in the text.

    Example:
        >>> import toon_format
        >>> text = "Hello, world!"
        >>> toon_format.count_tokens(text)
        4

    Note:
        Requires tiktoken to be installed:
        uv add tiktoken or uv add toon_format[benchmark]
    """
    # Non-default encodings bypass the single-entry cache and are looked
    # up directly from tiktoken.
    if encoding != "o200k_base":
        tokenizer = _require_tiktoken().get_encoding(encoding)
        return len(tokenizer.encode(text))
    return len(_get_tokenizer().encode(text))
def estimate_savings(data: Any, encoding: str = "o200k_base") -> Dict[str, Any]:
    """Compare token counts between JSON and TOON formats.

    Args:
        data: Python dict or list to compare.
        encoding: Tokenizer encoding name (default: 'o200k_base').

    Returns:
        dict: Dictionary containing:
            - json_tokens (int): Token count for JSON format
            - toon_tokens (int): Token count for TOON format
            - savings (int): Tokens saved by TOON, clamped to zero when
              TOON is not smaller
            - savings_percent (float): Percentage savings (0.0 when the
              JSON rendering produces no tokens)

    Example:
        >>> import toon_format
        >>> data = {"employees": [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]}
        >>> result = toon_format.estimate_savings(data)
        >>> print(f"Savings: {result['savings_percent']:.1f}%")
        Savings: 42.3%

    Note:
        Significant savings are typically achieved with structured data,
        especially arrays of uniform objects (tabular data).
    """
    # Token counts for both renderings of the same data.
    json_tokens = count_tokens(json.dumps(data, indent=2, ensure_ascii=False), encoding)
    toon_tokens = count_tokens(encode(data), encoding)

    # Never report negative savings; clamp at zero when TOON is larger.
    saved = json_tokens - toon_tokens
    if saved < 0:
        saved = 0
    percent = (100.0 * saved / json_tokens) if json_tokens else 0.0

    return {
        "json_tokens": json_tokens,
        "toon_tokens": toon_tokens,
        "savings": saved,
        "savings_percent": percent,
    }
def compare_formats(data: Any, encoding: str = "o200k_base") -> str:
    """Generate a formatted comparison table showing JSON vs TOON metrics.

    Args:
        data: Python dict or list to compare.
        encoding: Tokenizer encoding name (default: 'o200k_base').

    Returns:
        str: Formatted table as multi-line string showing token counts,
            character sizes, and savings percentage.

    Example:
        >>> import toon_format
        >>> data = {"users": [{"id": 1, "name": "Alice"}]}
        >>> print(toon_format.compare_formats(data))
        Format Comparison
        ββββββββββββββββββββββββββββββββββββββββββββββββ
        Format Tokens Size (chars)
        JSON 1,234 5,678
        TOON 789 3,456
        ββββββββββββββββββββββββββββββββββββββββββββββββ
        Savings: 445 tokens (36.1%)

    Note:
        This is useful for quick visual comparison during development.
    """
    # Token metrics come from the shared comparison helper.
    metrics = estimate_savings(data, encoding)

    # Character sizes require re-rendering both formats.
    json_chars = len(json.dumps(data, indent=2, ensure_ascii=False))
    toon_chars = len(encode(data))

    rule = "β" * 48
    return "\n".join(
        (
            "Format Comparison",
            rule,
            "Format Tokens Size (chars)",
            f"JSON {metrics['json_tokens']:>7,} {json_chars:>11,}",
            f"TOON {metrics['toon_tokens']:>7,} {toon_chars:>11,}",
            rule,
            f"Savings: {metrics['savings']:,} tokens ({metrics['savings_percent']:.1f}%)",
        )
    )