@@ -29,12 +29,12 @@ void perform_recursive_chunking(
2929 const char * start_ptr = text_view.data ();
3030 size_t start_pos = start_ptr - original_text.data ();
3131 size_t end_pos = start_pos + text_view.length ();
32-
32+
3333 TextChunk chunk;
3434 chunk.text = original_text.substr (start_pos, end_pos - start_pos);
3535 chunk.start_position = start_pos;
3636 chunk.end_position = end_pos;
37-
37+
3838 size_t first = chunk.text .find_first_not_of (" \t\n\r " );
3939 size_t last = chunk.text .find_last_not_of (" \t\n\r " );
4040 if (first != std::string::npos && last != std::string::npos) {
@@ -91,12 +91,12 @@ void perform_recursive_chunking(
9191 const char * end_ptr = current_batch.back ().data () + current_batch.back ().length ();
9292 size_t start_pos = start_ptr - original_text.data ();
9393 size_t end_pos = end_ptr - original_text.data ();
94-
94+
9595 TextChunk chunk;
9696 chunk.text = original_text.substr (start_pos, end_pos - start_pos);
9797 chunk.start_position = start_pos;
9898 chunk.end_position = end_pos;
99-
99+
100100 size_t first = chunk.text .find_first_not_of (" \t\n\r " );
101101 size_t last = chunk.text .find_last_not_of (" \t\n\r " );
102102 if (first != std::string::npos && last != std::string::npos) {
@@ -115,12 +115,12 @@ void perform_recursive_chunking(
115115
116116 for (size_t i = 0 ; i < splits.size (); ++i) {
117117 auto split = splits[i];
118-
118+
119119 if (split.length () > chunk_size_chars) {
120120 emit_chunk ();
121121 current_batch.clear ();
122122 current_length = 0 ;
123-
123+
124124 if (!next_separators.empty ()) {
125125 perform_recursive_chunking (split, original_text, next_separators, chunk_size_chars, chunk_overlap_chars, output_chunks, chunk_index);
126126 } else {
@@ -147,16 +147,17 @@ void perform_recursive_chunking(
147147 current_batch.erase (current_batch.begin ());
148148 }
149149 }
150-
150+
151151 current_batch.push_back (split);
152152 current_length += split.length ();
153153 }
154-
154+
155155 if (!current_batch.empty ()) {
156156 emit_chunk ();
157157 }
158158}
159159
160+ } // anonymous namespace
160161
// Constructs a DocumentChunker by initializing the `config_` member from `config`.
// The constructor body is empty; no validation or other work happens here.
// `config_` is read later by chunk_document (chunk_size, chunk_overlap,
// chars_per_token) to derive chunk and overlap sizes in characters.
// NOTE(review): whether `config_` holds a copy or a reference depends on the
// member declaration in the header, which is not visible here — confirm.
161162DocumentChunker::DocumentChunker (const ChunkerConfig& config) : config_(config) {}
162163
@@ -167,10 +168,10 @@ std::vector<TextChunk> DocumentChunker::chunk_document(const std::string& text)
167168
168169 std::vector<TextChunk> chunks;
169170 size_t chunk_index = 0 ;
170-
171+
171172 size_t chunk_size_chars = config_.chunk_size * config_.chars_per_token ;
172173 size_t overlap_chars = config_.chunk_overlap * config_.chars_per_token ;
173-
174+
174175 // Hierarchy of separators for standard English text
175176 std::vector<std::string> separators = {" \n\n " , " \n " , " . " , " ? " , " ! " , " ; " , " , " , " " , " " };
176177
@@ -213,7 +214,7 @@ std::vector<size_t> DocumentChunker::find_sentence_boundaries(const std::string&
213214
214215 for (size_t i = 0 ; i < text.length (); ++i) {
215216 char c = text[i];
216-
217+
217218 // Check for sentence endings
218219 if (c == ' .' || c == ' !' || c == ' ?' || c == ' \n ' ) {
219220 // Look ahead for whitespace
@@ -228,4 +229,4 @@ std::vector<size_t> DocumentChunker::find_sentence_boundaries(const std::string&
228229}
229230
230231} // namespace rag
231- } // namespace runanywhere
232+ } // namespace runanywhere
0 commit comments