
Commit 1f9e939

Rayan Dasoriya authored and copybara-github committed
No public description
MG_DOCKER_CODES_PIPER_ORIGIN_REV_ID: 878215121
1 parent 0628351 commit 1f9e939

2 files changed: 16 additions & 56 deletions


notebooks/community/model_garden/docker_source_codes/model_oss/notebook_util/dataset_validation_util.py

Lines changed: 8 additions & 28 deletions
@@ -8,7 +8,6 @@
 import multiprocessing
 import os
 import subprocess
-import sys
 from typing import Any, Union
 from absl import logging
 import accelerate
@@ -552,51 +551,32 @@ def drop_long_sequences(
     input_column: str,
     max_sequence_length: int,
     tokenizer: transformers.PreTrainedTokenizer,
-    dataset_dropped_threshold: float,
     is_train: bool,
 ) -> tuple[Any, Any, int]:
-  """Returns the dataset by removing examples that are longer than max_seq_length.
+  """Drops examples longer than max_seq_length from the dataset.
 
   Args:
     dataset: The dataset to filter.
     dataset_with_template: The dataset with template to filter.
     input_column: The input column in the dataset to be used.
     max_sequence_length: The maximum sequence length.
     tokenizer: The tokenizer.
-    dataset_dropped_threshold: The threshold for the number of samples dropped
-      from the dataset.
     is_train: Whether the dataset is for training.
 
   Returns:
     A tuple of (filtered_dataset, filtered_dataset_with_template,
     dropped_samples).
   """
+
   context_name = f"the {'train' if is_train else 'eval'} dataset"
-  indices_to_keep, original_length, dropped_samples = (
-      _get_indices_for_valid_length(
-          dataset_with_template,
-          input_column,
-          max_sequence_length,
-          tokenizer,
-          context_name,
-      )
+  indices_to_keep, _, dropped_samples = _get_indices_for_valid_length(
+      dataset_with_template,
+      input_column,
+      max_sequence_length,
+      tokenizer,
+      context_name,
   )
 
-  if (
-      original_length > 0
-      and dropped_samples / original_length * 100 > dataset_dropped_threshold
-  ):
-    logging.error(
-        "More than %f%% of the samples were dropped from {%s} after"
-        " filtering for max_sequence_length=%d. Please check your dataset.",
-        dataset_dropped_threshold,
-        context_name,
-        max_sequence_length,
-    )
-
-    # handling library when available.
-    sys.exit(1)
-
   filtered_dataset = dataset.select(indices_to_keep)
   filtered_dataset_with_template = dataset_with_template.select(indices_to_keep)
   return filtered_dataset, filtered_dataset_with_template, dropped_samples
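
Note: _get_indices_for_valid_length is untouched by this commit, so its body is not shown. From the call site one can only infer that it takes (dataset_with_template, input_column, max_sequence_length, tokenizer, context_name) and returns a (indices_to_keep, original_length, dropped_samples) triple; the new code discards original_length, whose only consumer was the deleted threshold check. A minimal sketch of what such a helper might look like, purely as an assumption for orientation:

from typing import Any

from absl import logging
import transformers


def _get_indices_for_valid_length(
    dataset_with_template: Any,
    input_column: str,
    max_sequence_length: int,
    tokenizer: transformers.PreTrainedTokenizer,
    context_name: str,
) -> tuple[list[int], int, int]:
  """Assumed behavior: keep indices whose tokenized length fits the limit."""
  indices_to_keep = []
  for i, example in enumerate(dataset_with_template):
    # Tokenize the templated input and keep examples within the limit.
    token_ids = tokenizer(example[input_column])["input_ids"]
    if len(token_ids) <= max_sequence_length:
      indices_to_keep.append(i)
  original_length = len(dataset_with_template)
  dropped_samples = original_length - len(indices_to_keep)
  logging.info(
      "Dropped %d of %d examples from %s.",
      dropped_samples,
      original_length,
      context_name,
  )
  return indices_to_keep, original_length, dropped_samples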

notebooks/community/model_garden/docker_source_codes/notebook_util/dataset_validation_util.py

Lines changed: 8 additions & 28 deletions
(Identical change to notebooks/community/model_garden/docker_source_codes/model_oss/notebook_util/dataset_validation_util.py above; the diff is not repeated here.)
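
The practical effect of the commit, in both files, is that drop_long_sequences no longer terminates the process when a large fraction of the dataset is filtered out: the dataset_dropped_threshold flag, the percentage check, and the sys.exit(1) are gone, and dropped_samples is simply returned. A caller that still wants the old guard can rebuild it from the return value. A hypothetical sketch, reusing the percentage formula from the deleted lines (the dataset, tokenizer, and threshold names are illustrative assumptions, not part of this commit):

from absl import logging

# Assumed caller-side values; the threshold plays the role of the removed flag.
DATASET_DROPPED_THRESHOLD = 10.0  # Percent.
MAX_SEQUENCE_LENGTH = 2048

original_length = len(train_dataset_with_template)
train_dataset, train_dataset_with_template, dropped_samples = (
    drop_long_sequences(
        dataset=train_dataset,
        dataset_with_template=train_dataset_with_template,
        input_column="text",
        max_sequence_length=MAX_SEQUENCE_LENGTH,
        tokenizer=tokenizer,
        is_train=True,
    )
)

# Same percentage math as the removed in-function check, but the caller now
# chooses the failure mode (an exception here) instead of the utility
# calling sys.exit(1).
if (
    original_length > 0
    and dropped_samples / original_length * 100 > DATASET_DROPPED_THRESHOLD
):
  logging.error(
      "More than %f%% of the samples were dropped from the train dataset"
      " after filtering for max_sequence_length=%d.",
      DATASET_DROPPED_THRESHOLD,
      MAX_SEQUENCE_LENGTH,
  )
  raise ValueError("Too many samples dropped; please check your dataset.")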
