Skip to content

Commit 775571e

Browse files
authored
[APIView] Sanitize text tokens of newlines on ingestion (#15081)
* [APIView] Sanitize text tokens of newlines on ingestion * respond to copilot * Respond to feedback
1 parent 7569041 commit 775571e

File tree

2 files changed

+151
-0
lines changed

2 files changed

+151
-0
lines changed

src/dotnet/APIView/APIViewUnitTests/CodeFileManagerTests.cs

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -415,4 +415,104 @@ public async Task ComputeAPIContentHashAsync_ProducesDifferentHash_WhenApiSurfac
415415
}
416416

417417
#endregion
418+
419+
#region Token Sanitization Tests
420+
421+
/// <summary>
/// Verifies that CR/LF characters embedded in a tree-style Text token are replaced
/// with spaces when the code file is ingested via CreateReviewCodeFileModel.
/// </summary>
[Fact]
public async Task CreateReviewCodeFileModel_SanitizesNewlinesInTreeTokenValues()
{
    // Arrange: a documentation token whose value spans several lines.
    var codeFile = new CodeFile
    {
        Language = "Java",
        PackageName = "com.azure.storage",
        ReviewLines =
        [
            new ReviewLine
            {
                LineId = "doc-1",
                Tokens =
                [
                    new ReviewToken
                    {
                        Value = "\n This package contains clients.\n For details see README.md\n ",
                        Kind = TokenKind.Text,
                        IsDocumentation = true
                    }
                ]
            }
        ]
    };

    using var memoryStream = new MemoryStream();

    // Act
    await _codeFileManager.CreateReviewCodeFileModel("api-rev-1", memoryStream, codeFile);

    // Assert: every '\n' becomes exactly one space and no other whitespace is
    // collapsed or trimmed, so each "\n " pair in the input yields two spaces.
    string value = codeFile.ReviewLines[0].Tokens[0].Value;
    Assert.Equal("  This package contains clients.  For details see README.md  ", value);
    Assert.DoesNotContain('\n', value);
    Assert.DoesNotContain('\r', value);
}
455+
456+
/// <summary>
/// Verifies that sanitization descends into child lines and only rewrites tokens
/// of kind Text: a Keyword token keeps its newline, a nested Text token does not.
/// </summary>
[Fact]
public async Task CreateReviewCodeFileModel_SanitizesOnlyNestedTextTokens()
{
    // Arrange: a parent line with a non-Text token and one child line with a Text token.
    var nestedLine = new ReviewLine
    {
        LineId = "child",
        Tokens = [new ReviewToken("Child\ntoken\nvalue", TokenKind.Text)]
    };
    var topLevelLine = new ReviewLine
    {
        LineId = "parent",
        Tokens = [new ReviewToken("Parent\ntoken", TokenKind.Keyword)],
        Children = [nestedLine]
    };
    var codeFile = new CodeFile
    {
        Language = "C#",
        ReviewLines = [topLevelLine]
    };

    using var stream = new MemoryStream();

    // Act
    await _codeFileManager.CreateReviewCodeFileModel("api-rev-3", stream, codeFile);

    // Assert: read the values back through the code file, as a consumer would.
    var parentAfter = codeFile.ReviewLines[0];
    var childAfter = parentAfter.Children[0];
    Assert.Equal("Parent\ntoken", parentAfter.Tokens[0].Value);
    Assert.Equal("Child token value", childAfter.Tokens[0].Value);
}
487+
488+
/// <summary>
/// Verifies that Text tokens containing no CR/LF characters pass through ingestion
/// unchanged, including a token that consists solely of whitespace.
/// </summary>
[Fact]
public async Task CreateReviewCodeFileModel_DoesNotModifyTreeTokenWithoutNewlines()
{
    // Arrange: two newline-free Text tokens on a single line.
    var line = new ReviewLine
    {
        LineId = "tree-1",
        Tokens =
        [
            new ReviewToken(" ", TokenKind.Text),
            new ReviewToken("NoNewlines", TokenKind.Text)
        ]
    };
    var codeFile = new CodeFile
    {
        Language = "Java",
        ReviewLines = [line]
    };

    using var stream = new MemoryStream();

    // Act
    await _codeFileManager.CreateReviewCodeFileModel("api-rev-3a", stream, codeFile);

    // Assert: both values are exactly what went in.
    var tokensAfter = codeFile.ReviewLines[0].Tokens;
    Assert.Equal(" ", tokensAfter[0].Value);
    Assert.Equal("NoNewlines", tokensAfter[1].Value);
}
515+
516+
#endregion
517+
418518
}

src/dotnet/APIView/APIViewWeb/Managers/CodeFileManager.cs

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,9 @@ public async Task<APICodeFileModel> CreateReviewCodeFileModel(string apiRevision
217217
memoryStream.Position = 0;
218218
await _originalsRepository.UploadOriginalAsync(reviewCodeFileModel.FileId, memoryStream);
219219
}
220+
221+
SanitizeTokenValues(codeFile);
222+
220223
await _codeFileRepository.UpsertCodeFileAsync(apiRevisionId, reviewCodeFileModel.FileId, codeFile);
221224
reviewCodeFileModel.ContentHash = await ComputeAPIContentHashAsync(codeFile);
222225
return reviewCodeFileModel;
@@ -316,5 +319,53 @@ private static void InitializeFromCodeFile(APICodeFileModel file, CodeFile codeF
316319
file.CrossLanguagePackageId = codeFile.CrossLanguageMetadata != null ? codeFile.CrossLanguageMetadata.CrossLanguagePackageId : codeFile.CrossLanguagePackageId;
317320
file.ParserStyle = (codeFile.ReviewLines.Count > 0) ? ParserStyle.Tree : ParserStyle.Flat;
318321
}
322+
323+
/// <summary>
/// Entry point for token sanitization: walks the tree-style <c>ReviewLines</c> of
/// <paramref name="codeFile"/> and rewrites token values in place so they no longer
/// contain embedded newlines. Legacy flat tokens are deliberately not touched.
/// </summary>
/// <param name="codeFile">The code file whose tree-style tokens are sanitized in place.</param>
private static void SanitizeTokenValues(CodeFile codeFile)
{
    var reviewLines = codeFile.ReviewLines;

    // No tree-style content: nothing to sanitize.
    if (reviewLines == null || reviewLines.Count == 0)
    {
        return;
    }

    // ReviewToken is a class, so the recursive walk mutates the tokens directly.
    SanitizeReviewLines(reviewLines);
}
335+
336+
/// <summary>
/// Recursively sanitizes token values in <paramref name="lines"/> and all of their
/// descendant lines. Only tokens of kind <c>TokenKind.Text</c> with a non-empty value
/// are rewritten; every other token is left exactly as it was.
/// ReviewToken is a reference type, so mutations are applied in place.
/// </summary>
/// <param name="lines">The lines to process; a null list is treated as empty.</param>
private static void SanitizeReviewLines(List<ReviewLine> lines)
{
    if (lines == null)
    {
        return;
    }

    foreach (var line in lines)
    {
        // Tokens gets the same null tolerance as Children below: the code file comes
        // from ingestion, and a missing token list must not abort the whole walk.
        if (line.Tokens != null)
        {
            foreach (var token in line.Tokens)
            {
                if (token != null && token.Kind == TokenKind.Text && !string.IsNullOrEmpty(token.Value))
                {
                    token.Value = NormalizeTokenValue(token.Value);
                }
            }
        }

        if (line.Children != null && line.Children.Count > 0)
        {
            SanitizeReviewLines(line.Children);
        }
    }
}
355+
356+
/// <summary>
/// Normalizes a token value by replacing every line break with a single space.
/// A <c>\r\n</c> pair counts as one line break (one space); a lone <c>\r</c> or
/// <c>\n</c> is also replaced by one space. No other whitespace is collapsed and
/// the result is NOT trimmed, so leading/trailing spaces and whitespace-only
/// values are preserved.
/// </summary>
/// <param name="value">The raw token value; may be null or empty.</param>
/// <returns>
/// <paramref name="value"/> with all CR/LF characters replaced by spaces, or
/// <paramref name="value"/> itself when it is null or empty.
/// </returns>
private static string NormalizeTokenValue(string value)
{
    if (string.IsNullOrEmpty(value))
    {
        return value;
    }

    // Order matters: handle the two-character "\r\n" first so it yields one space,
    // then sweep up any remaining lone '\r' or '\n'.
    return value
        .Replace("\r\n", " ")
        .Replace('\r', ' ')
        .Replace('\n', ' ');
}
319370
}
320371
}

0 commit comments

Comments
 (0)