Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 7 additions & 13 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,13 @@
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit"
},
}
},
"files.associations": {
"*.SFM": "usfm",
"*.SFM": "usfm"
},
"black-formatter.path": [
"poetry",
"run",
"black"
],
"isort.args": [
"--profile",
"black"
],
"black-formatter.path": ["poetry", "run", "black"],
"isort.args": ["--profile", "black"],
"cSpell.words": [
"CLEARML",
"DYNACONF",
Expand All @@ -34,5 +27,6 @@
"Usfm",
"venv"
],
"python-envs.defaultEnvManager": "ms-python.python:system",
}
"python-envs.defaultEnvManager": "ms-python.python:poetry",
"python-envs.defaultPackageManager": "ms-python.python:poetry"
}
2 changes: 2 additions & 0 deletions machine/translation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from .symmetrized_word_alignment_model import SymmetrizedWordAlignmentModel
from .symmetrized_word_alignment_model_trainer import SymmetrizedWordAlignmentModelTrainer
from .trainer import Trainer, TrainStats
from .transductive_word_alignment_model import TransductiveWordAlignmentModel
from .translation_constants import MAX_SEGMENT_LENGTH
from .translation_engine import TranslationEngine
from .translation_model import TranslationModel
Expand Down Expand Up @@ -69,6 +70,7 @@
"SymmetrizedWordAlignmentModelTrainer",
"Trainer",
"TrainStats",
"TransductiveWordAlignmentModel",
"translate_corpus",
"TranslationEngine",
"TranslationModel",
Expand Down
38 changes: 38 additions & 0 deletions machine/translation/corpus_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from ..corpora.parallel_text_row import ParallelTextRow
from ..utils.progress_status import ProgressStatus
from .symmetrization_heuristic import SymmetrizationHeuristic
from .transductive_word_alignment_model import TransductiveWordAlignmentModel
from .translation_engine import TranslationEngine
from .word_aligner import WordAligner
from .word_alignment_matrix import WordAlignmentMatrix
Expand All @@ -23,11 +24,16 @@ def word_align_corpus(

model = create_thot_symmetrized_word_alignment_model(aligner)
model.heuristic = symmetrization_heuristic
# Retain the alignments computed during training so that the corpus can be aligned
# without a separate, potentially expensive, inference pass.
model.emit_training_alignments = True
with model.create_trainer(corpus) as trainer:
trainer.train(progress)
trainer.save()
aligner = model

if isinstance(aligner, TransductiveWordAlignmentModel):
return _TransductiveWordAlignParallelTextCorpus(corpus, aligner)
return _WordAlignParallelTextCorpus(corpus, aligner, batch_size)


Expand Down Expand Up @@ -67,6 +73,38 @@ def _get_rows(self, text_ids: Optional[Iterable[str]] = None) -> Generator[Paral
yield row


class _TransductiveWordAlignParallelTextCorpus(ParallelTextCorpus):
    """Parallel corpus view that attaches alignments retained from training to each row.

    Instead of running the aligner over the rows, each row is paired with the
    alignment the model reports for the row's position in the wrapped corpus.

    NOTE(review): this assumes the model was trained on exactly this corpus, in the
    same row order, with training alignments retained -- confirm at the call site.
    """

    def __init__(self, corpus: ParallelTextCorpus, model: TransductiveWordAlignmentModel) -> None:
        self._corpus = corpus
        self._model = model

    # Exposed as properties (not plain methods) so that attribute-style access, as used
    # on the wrapped corpus below, yields a bool rather than an always-truthy bound method.
    @property
    def is_source_tokenized(self) -> bool:
        """Whether the wrapped corpus reports its source side as tokenized."""
        return self._corpus.is_source_tokenized

    @property
    def is_target_tokenized(self) -> bool:
        """Whether the wrapped corpus reports its target side as tokenized."""
        return self._corpus.is_target_tokenized

    def _get_rows(self, text_ids: Optional[Iterable[str]] = None) -> Generator[ParallelTextRow, None, None]:
        """Yield rows of the wrapped corpus with ``aligned_word_pairs`` filled in.

        Args:
            text_ids: Optional text ids to restrict the output to. ``None`` yields every row.

        Yields:
            Each selected row, mutated in place so that ``aligned_word_pairs`` holds the
            word pairs derived from the training alignment at the row's position.
        """
        # The training alignments are keyed by the order in which the sentence pairs were added
        # during training, so the full corpus must be iterated to keep the index in sync; rows that
        # are not in the requested texts are skipped rather than filtered out of the enumeration.
        text_id_set = None if text_ids is None else set(text_ids)
        with self._corpus.get_rows() as rows:
            for index, row in enumerate(rows):
                if text_id_set is not None and row.text_id not in text_id_set:
                    continue
                alignment = self._model.get_training_alignment(index)
                # If the row already carries an alignment, merge the training alignment
                # into it and use the merged matrix from here on.
                known_alignment = WordAlignmentMatrix.from_parallel_text_row(row)
                if known_alignment is not None:
                    known_alignment.priority_symmetrize_with(alignment)
                    alignment = known_alignment
                word_pairs = alignment.to_aligned_word_pairs()
                # Scores are only available when the model is also a full alignment model.
                if isinstance(self._model, WordAlignmentModel):
                    self._model.compute_aligned_word_pair_scores(row.source_segment, row.target_segment, word_pairs)
                row.aligned_word_pairs = word_pairs
                yield row


class _TranslateParallelTextCorpus(ParallelTextCorpus):
def __init__(self, corpus: ParallelTextCorpus, translation_engine: TranslationEngine, batch_size: int) -> None:
self._corpus = corpus
Expand Down
26 changes: 22 additions & 4 deletions machine/translation/thot/thot_symmetrized_word_alignment_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,15 @@
from ..symmetrized_word_alignment_model import SymmetrizedWordAlignmentModel
from ..symmetrized_word_alignment_model_trainer import SymmetrizedWordAlignmentModelTrainer
from ..trainer import Trainer
from ..transductive_word_alignment_model import TransductiveWordAlignmentModel
from ..word_alignment_matrix import WordAlignmentMatrix
from .thot_utils import batch
from .thot_word_alignment_model import ThotWordAlignmentModel

_MAX_BATCH_SIZE = 10240


class ThotSymmetrizedWordAlignmentModel(SymmetrizedWordAlignmentModel):
class ThotSymmetrizedWordAlignmentModel(SymmetrizedWordAlignmentModel, TransductiveWordAlignmentModel):
def __init__(
self,
direct_word_alignment_model: ThotWordAlignmentModel,
Expand Down Expand Up @@ -56,17 +57,34 @@ def align_batch(self, segments: Sequence[Sequence[Sequence[str]]]) -> Sequence[W
results.append(WordAlignmentMatrix(matrix.to_numpy()))
return results

@property
def emit_training_alignments(self) -> bool:
    """Return the ``emit_training_alignments`` flag of the direct word alignment model."""
    direct_model = self.direct_word_alignment_model
    return direct_model.emit_training_alignments

@emit_training_alignments.setter
def emit_training_alignments(self, value: bool) -> None:
    """Set the flag on both the direct and the inverse word alignment model."""
    # Direct model first, then inverse, so the two directions always carry the same value.
    direct_model = self.direct_word_alignment_model
    direct_model.emit_training_alignments = value
    inverse_model = self.inverse_word_alignment_model
    inverse_model.emit_training_alignments = value

@property
def training_alignment_count(self) -> int:
    """Return the number of sentence pairs reported by the symmetrized aligner.

    NOTE(review): presumably the number of training alignments that can be
    fetched by index -- confirm against the thot API.
    """
    aligner = self._aligner
    return aligner.num_sentence_pairs

def get_training_alignment(self, n: int) -> WordAlignmentMatrix:
    """Return the training alignment at index ``n`` as a ``WordAlignmentMatrix``.

    The symmetrized aligner returns a pair; only its second element (the matrix)
    is used, converted through ``to_numpy()``.
    """
    _unused, thot_matrix = self._aligner.get_training_alignment(n)
    alignment_array = thot_matrix.to_numpy()
    return WordAlignmentMatrix(alignment_array)

def create_trainer(self, corpus: ParallelTextCorpus) -> Trainer:
direct_trainer = self._direct_word_alignment_model.create_trainer(corpus)
inverse_trainer = self._inverse_word_alignment_model.create_trainer(corpus.invert())
direct_trainer = self.direct_word_alignment_model.create_trainer(corpus)
inverse_trainer = self.inverse_word_alignment_model.create_trainer(corpus.invert())

return _Trainer(self, direct_trainer, inverse_trainer)

def __enter__(self) -> ThotSymmetrizedWordAlignmentModel:
    # Context-manager entry: hands back this model unchanged.
    return self

def _reset_aligner(self) -> None:
self._aligner = ta.SymmetrizedAligner(
self._aligner = ta.SymmetrizedAlignmentModel(
self.direct_word_alignment_model.thot_model, self.inverse_word_alignment_model.thot_model
)
self._aligner.heuristic = _convert_heuristic(self._heuristic)
Expand Down
20 changes: 18 additions & 2 deletions machine/translation/thot/thot_word_alignment_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from ...corpora.parallel_text_corpus import ParallelTextCorpus
from ...utils.typeshed import StrPath
from ..ibm1_word_alignment_model import Ibm1WordAlignmentModel
from ..transductive_word_alignment_model import TransductiveWordAlignmentModel
from ..word_alignment_matrix import WordAlignmentMatrix
from ..word_vocabulary import WordVocabulary
from .thot_utils import batch, escape_token, escape_tokens, unescape_token
Expand All @@ -21,7 +22,7 @@
_MAX_BATCH_SIZE = 10240


class ThotWordAlignmentModel(Ibm1WordAlignmentModel):
class ThotWordAlignmentModel(Ibm1WordAlignmentModel, TransductiveWordAlignmentModel):
def __init__(self, prefix_filename: Optional[StrPath] = None, create_new: bool = False) -> None:
self._set_model(self._create_model())
if prefix_filename is not None:
Expand All @@ -33,6 +34,7 @@ def __init__(self, prefix_filename: Optional[StrPath] = None, create_new: bool =
else:
self._prefix_filename = None
self.parameters = ThotWordAlignmentParameters()
self.emit_training_alignments = False

@property
def source_words(self) -> WordVocabulary:
Expand Down Expand Up @@ -94,6 +96,14 @@ def align_batch(self, segments: Sequence[Sequence[Sequence[str]]]) -> Sequence[W
results.append(WordAlignmentMatrix(matrix.to_numpy()))
return results

@property
def training_alignment_count(self) -> int:
    """Return the number of sentence pairs reported by the underlying thot model.

    NOTE(review): presumably the number of training alignments that can be
    fetched by index -- confirm against the thot API.
    """
    thot_model = self._model
    return thot_model.num_sentence_pairs

def get_training_alignment(self, n: int) -> WordAlignmentMatrix:
    """Return the training alignment at index ``n`` as a ``WordAlignmentMatrix``.

    The underlying thot model returns a pair; only its second element (the matrix)
    is used, converted through ``to_numpy()``.
    """
    _unused, thot_matrix = self._model.get_training_alignment(n)
    alignment_array = thot_matrix.to_numpy()
    return WordAlignmentMatrix(alignment_array)

def get_translation_score(
self, source_word: Optional[Union[str, int]], target_word: Optional[Union[str, int]]
) -> float:
Expand Down Expand Up @@ -199,7 +209,13 @@ class _Trainer(ThotWordAlignmentModelTrainer):
def __init__(
self, model: ThotWordAlignmentModel, corpus: ParallelTextCorpus, prefix_filename: Optional[StrPath]
) -> None:
super().__init__(model.type, corpus, prefix_filename, model.parameters)
super().__init__(
model.type,
corpus,
prefix_filename,
model.parameters,
emit_training_alignments=model.emit_training_alignments,
)
self._machine_model = model

def save(self) -> None:
Expand Down
11 changes: 11 additions & 0 deletions machine/translation/thot/thot_word_alignment_model_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def __init__(
source_tokenizer: Tokenizer[str, int, str] = WHITESPACE_TOKENIZER,
target_tokenizer: Tokenizer[str, int, str] = WHITESPACE_TOKENIZER,
max_corpus_count: int = sys.maxsize,
emit_training_alignments: bool = False,
) -> None: ...

@overload
Expand All @@ -40,6 +41,8 @@ def __init__(
parameters: ThotWordAlignmentParameters = ThotWordAlignmentParameters(),
source_tokenizer: Tokenizer[str, int, str] = WHITESPACE_TOKENIZER,
target_tokenizer: Tokenizer[str, int, str] = WHITESPACE_TOKENIZER,
max_corpus_count: int = sys.maxsize,
emit_training_alignments: bool = False,
) -> None: ...

def __init__(
Expand All @@ -51,6 +54,7 @@ def __init__(
source_tokenizer: Tokenizer[str, int, str] = WHITESPACE_TOKENIZER,
target_tokenizer: Tokenizer[str, int, str] = WHITESPACE_TOKENIZER,
max_corpus_count: int = sys.maxsize,
emit_training_alignments: bool = False,
) -> None:
if isinstance(corpus, tuple) and max_corpus_count != sys.maxsize:
raise ValueError("max_corpus_count cannot be set when corpus filenames are provided.")
Expand All @@ -60,6 +64,7 @@ def __init__(
self._max_corpus_count = max_corpus_count
self.source_tokenizer = source_tokenizer
self.target_tokenizer = target_tokenizer
self.emit_training_alignments = emit_training_alignments
self._stats = TrainStats()

if isinstance(model_type, str):
Expand Down Expand Up @@ -216,6 +221,12 @@ def report() -> None:
if check_canceled is not None:
check_canceled()

if self.emit_training_alignments:
# Retain the alignments computed during training so that they can be returned without a
# separate inference pass. Only the final (most refined) model's alignments are needed,
# since that is the model used for inference.
self._model.emit_training_alignments = True

trained_segment_count = 0
for model, iteration_count in self._models:
if iteration_count == 0 and not self._is_eflomal:
Expand Down
13 changes: 13 additions & 0 deletions machine/translation/transductive_word_alignment_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from abc import ABC, abstractmethod

from .word_alignment_matrix import WordAlignmentMatrix


class TransductiveWordAlignmentModel(ABC):
    """Interface for word alignment models that expose alignments by training index."""

    @property
    @abstractmethod
    def training_alignment_count(self) -> int:
        """Return how many training alignments the model holds."""

    @abstractmethod
    def get_training_alignment(self, n: int) -> WordAlignmentMatrix:
        """Return the alignment matrix stored at training index ``n``."""
Loading
Loading