toon-python/src/toon_format/_scanner.py at 805a1b81859bbea6bb28219e1fbf5f027d2299bf · toon-format/toon-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
# Copyright (c) 2025 TOON Format Organization
# SPDX-License-Identifier: MIT
"""Scanner for parsing TOON input into lines with depth information.

This module implements the first stage of the TOON decoding pipeline:
scanning the input text and converting it into structured line objects
with depth and indentation metadata. Handles strict and lenient parsing modes.
"""

from dataclasses import dataclass
from typing import List, Optional, Tuple

from .constants import SPACE, TAB


@dataclass
class ParsedLine:
    """A parsed line with metadata.

    Attributes:
        raw: The original raw line content
        depth: The indentation depth (number of indent levels)
        indent: The number of leading spaces
        content: The line content after removing indentation
        line_num: The 1-based line number in the source
    """

    raw: str
    depth: int
    indent: int
    content: str
    line_num: int

    @property
    def is_blank(self) -> bool:
        """Check if this line is blank (only whitespace).

        Returns:
            True if the line contains only whitespace
        """
        return not self.content.strip()


@dataclass
class BlankLineInfo:
    """Information about a blank line.

    Attributes:
        line_num: The 1-based line number
        indent: The number of leading spaces
        depth: The computed indentation depth
    """

    line_num: int
    indent: int
    depth: int


class LineCursor:
    """Iterator-like class for traversing parsed lines.

    Provides methods to peek at the current line, advance to the next line,
    and check for lines at specific depths. This abstraction makes the decoder
    logic cleaner and easier to test.
    """

    def __init__(
        self,
        lines: List[ParsedLine],
        blank_lines: Optional[List[BlankLineInfo]] = None,
    ) -> None:
        """Initialize a line cursor.

        Args:
            lines: The parsed lines to traverse
            blank_lines: Optional list of blank line information
        """
        self._lines = lines
        self._index = 0
        self._blank_lines = blank_lines or []

    def get_blank_lines(self) -> List[BlankLineInfo]:
        """Get the list of blank lines."""
        return self._blank_lines

    def peek(self) -> Optional[ParsedLine]:
        """Peek at the current line without advancing.

        Returns:
            The current line, or None if at end
        """
        if self._index >= len(self._lines):
            return None
        return self._lines[self._index]

    def next(self) -> Optional[ParsedLine]:
        """Get the current line and advance.

        Returns:
            The current line, or None if at end
        """
        if self._index >= len(self._lines):
            return None
        line = self._lines[self._index]
        self._index += 1
        return line

    def current(self) -> Optional[ParsedLine]:
        """Get the most recently consumed line.

        Returns:
            The previous line, or None if no line has been consumed
        """
        if self._index > 0:
            return self._lines[self._index - 1]
        return None

    def advance(self) -> None:
        """Advance to the next line."""
        self._index += 1

    def at_end(self) -> bool:
        """Check if cursor is at the end of lines.

        Returns:
            True if at end
        """
        return self._index >= len(self._lines)

    @property
    def length(self) -> int:
        """Get the total number of lines."""
        return len(self._lines)

    def peek_at_depth(self, target_depth: int) -> Optional[ParsedLine]:
        """Peek at the next line at a specific depth.

        Args:
            target_depth: The target depth

        Returns:
            The line if it matches the depth, None otherwise
        """
        line = self.peek()
        if not line or line.depth < target_depth:
            return None
        if line.depth == target_depth:
            return line
        return None

    def has_more_at_depth(self, target_depth: int) -> bool:
        """Check if there are more lines at a specific depth.

        Args:
            target_depth: The target depth

        Returns:
            True if there are more lines at the target depth
        """
        return self.peek_at_depth(target_depth) is not None

    def skip_deeper_than(self, depth: int) -> None:
        """Skip all lines that are deeper than the given depth.

        This is useful for skipping over nested structures after processing them.

        Args:
            depth: The reference depth. All lines with depth > this will be skipped.

        Example:
            >>> cursor.skip_deeper_than(1)  # Skip all lines at depth 2, 3, 4, etc.
        """
        line = self.peek()
        while line and line.depth > depth:
            self.advance()
            line = self.peek()


def to_parsed_lines(
    source: str,
    indent_size: int,
    strict: bool,
) -> Tuple[List[ParsedLine], List[BlankLineInfo]]:
    """Convert source string to parsed lines with depth information.

    Per Section 12 of the TOON specification for indentation handling.
    This is the entry point for the scanning stage of the decoder pipeline.

    Args:
        source: The source string to parse
        indent_size: The number of spaces per indentation level
        strict: Whether to enforce strict indentation validation

    Returns:
        A tuple of (parsed_lines, blank_lines)

    Raises:
        SyntaxError: If strict mode validation fails (tabs in indentation, invalid spacing)

    Examples:
        >>> lines, blanks = to_parsed_lines("name: Alice\\n  age: 30", 2, True)
        >>> lines[0].content
        'name: Alice'
        >>> lines[1].depth
        1
    """
    if not source.strip():
        return [], []

    # Normalize Windows CRLF line endings to LF
    # This prevents stray \r characters from appearing in content
    source = source.replace("\r\n", "\n")

    # Strip any remaining standalone \r characters (old Mac format)
    source = source.replace("\r", "\n")

    lines = source.split("\n")
    parsed: List[ParsedLine] = []
    blank_lines: List[BlankLineInfo] = []

    for i, raw in enumerate(lines):
        line_num = i + 1
        indent = 0
        while indent < len(raw) and raw[indent] == SPACE:
            indent += 1

        content = raw[indent:]

        # Compute depth for both blank and non-blank lines
        depth = _compute_depth_from_indent(indent, indent_size)

        # Track blank lines (but still include them in parsed list for validation)
        is_blank = not content.strip()
        if is_blank:
            blank_lines.append(
                BlankLineInfo(
                    line_num=line_num,
                    indent=indent,
                    depth=depth,
                )
            )
            # Blank lines are not validated for indentation
            # But we still add them to parsed list for array blank line detection

        # Strict mode validation (skip for blank lines)
        if strict and not is_blank:
            # Find the full leading whitespace region (spaces and tabs)
            ws_end = 0
            while ws_end < len(raw) and (raw[ws_end] == SPACE or raw[ws_end] == TAB):
                ws_end += 1

            # Check for tabs in leading whitespace (before actual content)
            if TAB in raw[:ws_end]:
                raise SyntaxError(
                    f"Line {line_num}: Tabs not allowed in indentation in strict mode"
                )

            # Check for exact multiples of indent_size
            if indent > 0 and indent % indent_size != 0:
                raise SyntaxError(
                    f"Line {line_num}: Indent must be exact multiple of {indent_size}, "
                    f"but found {indent} spaces"
                )

        parsed.append(
            ParsedLine(
                raw=raw,
                indent=indent,
                content=content,
                depth=depth,
                line_num=line_num,
            )
        )

    return parsed, blank_lines


def _compute_depth_from_indent(indent_spaces: int, indent_size: int) -> int:
    """Compute depth from indentation spaces.

    Args:
        indent_spaces: Number of leading spaces
        indent_size: Number of spaces per indentation level

    Returns:
        The computed depth

    Examples:
        >>> _compute_depth_from_indent(0, 2)
        0
        >>> _compute_depth_from_indent(4, 2)
        2
        >>> _compute_depth_from_indent(3, 2)  # Lenient mode
        1
    """
    return indent_spaces // indent_size