Encoder-Decoder Architecture¶
In this notebook, we run the .generate
method of Hugging Face as well as the generation process of generate-sequences
on a pretrained PyTorch sequence-to-sequence machine translation model. The model we run on uses an encoder-decoder, i.e. sequence-to-sequence, architecture.
In [1]:
import json
import torch
import evaluate
import datasets
from tqdm.auto import tqdm
from transformers import MarianTokenizer, MarianMTModel
from generate_sequences import GreedyGenerator, BeamSearchGenerator
Load the Model and Dataset¶
In [2]:
# load the translation model from transformers
# model_name = "Helsinki-NLP/opus-mt-ar-en"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to('cuda')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_name = "marefa-nlp/marefa-mt-en-ar"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
bleu_scorer = evaluate.load("sacrebleu")
test_dataset = datasets.load_dataset('iwslt2017','iwslt2017-ar-en', split='test')
/home/magedsaeed/virtualenvs/generate-sequences/lib/python3.12/site-packages/datasets/load.py:1486: FutureWarning: The repository for iwslt2017 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/iwslt2017 You can avoid this message in future by passing the argument `trust_remote_code=True`. Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`. warnings.warn(
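As the warning itself notes, it can be silenced by passing trust_remote_code=True when loading the dataset; an optional, equivalent call (shown here only for reference) would be:

test_dataset = datasets.load_dataset('iwslt2017', 'iwslt2017-ar-en', split='test', trust_remote_code=True)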
In [3]:
source_language = 'en'
target_language = 'ar'
Testing on the last 10 samples only.
In [4]:
input_texts = [example[source_language] for example in test_dataset['translation']][-10:]
targets = [example[target_language] for example in test_dataset['translation']][-10:]
len(input_texts), len(targets), input_texts[:5], targets[:5]
Out[4]:
(10, 10, ["One major consequence of this work is that maybe all of these decades, we've had the whole concept of cybernetic revolt in reverse.", "It's not that machines first become intelligent and then megalomaniacal and try to take over the world.", "It's quite the opposite, that the urge to take control of all possible futures is a more fundamental principle than that of intelligence, that general intelligence may in fact emerge directly from this sort of control-grabbing, rather than vice versa.", 'Another important consequence is goal seeking.', "I'm often asked, how does the ability to seek goals follow from this sort of framework?"], ['أحد العواقب الكبرى لهذا العمل هو أنه لربما طوال كل هذه العقود، كان لدينا المفهوم العكسي للثورة الآلية.', 'الأمر ليس في أن الآلات تصبح ذكية في البداية ثم ينتابها جنون العظمة و تحاول السيطرة على العالم.', 'إنه تماماً العكس، أن النزعة للسيطرة على كل الأزمنة المستقبلية الواردة هي مبدأ أساسي أكثر من مبدأ الذكاء، أن نواحي الذكاء العامة يمكن في الحقيقة أن تنبعث مباشرة من السيطرة، بدلاً من أن يكون الأمر بالعكس.', 'عاقبة أخرى مهمة هي البحث عن الهدف.', 'إنني أُسأل غالباً، كيف يمكن تفسير قدرة البحث عن الأهداف في هذا الإطار؟'])
Preparation and utility functions¶
We set use_cache=False,
as this disables the caching optimization applied to the transformers architecture [https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig.use_cache]. We also fix num_beams=1 and a batch size of 2 on the generation config.
In [5]:
model.generation_config.num_beams=1
model.generation_config.use_cache = False
model.generation_config.batch_size=2
In [6]:
def get_batches(inputs,batch_size):
for i in tqdm(
range(0, len(inputs), batch_size),
desc="Generating Sequences",
total=len(inputs) // batch_size,
):
yield inputs[i : i + batch_size]
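As a quick illustration (a minimal sketch, not part of the original run), the helper simply yields consecutive slices of its input, so five items with a batch size of 2 come back as three batches:

list(get_batches(['a', 'b', 'c', 'd', 'e'], batch_size=2))
# -> [['a', 'b'], ['c', 'd'], ['e']]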
Translate with the Hugging Face generate method¶
Using the greedy method¶
We set do_sample=False to get greedy decoding.
In [7]:
def translate(texts):
translated_texts = list()
for batch in get_batches(texts,batch_size=model.generation_config.batch_size):
translated_tokens = model.generate(
do_sample=False,
**tokenizer(batch, return_tensors="pt",padding=True),
)
translated_texts += [tokenizer.decode(t, skip_special_tokens=True) for t in translated_tokens]
return translated_texts
In [8]:
# Example batch of input sentences
hf_predictions = translate(input_texts)
len(input_texts), len(hf_predictions), len(targets)
Generating Sequences: 0%| | 0/5 [00:00<?, ?it/s]
Out[8]:
(10, 10, 10)
In [9]:
bleu_scorer.compute(predictions=hf_predictions, references=targets)
Out[9]:
{'score': 15.796125110909543, 'counts': [128, 58, 28, 13], 'totals': [264, 254, 244, 235], 'precisions': [48.484848484848484, 22.834645669291337, 11.475409836065573, 5.531914893617022], 'bp': 0.9701515036966302, 'sys_len': 264, 'ref_len': 272}
With multinomial sampling, top_k, top_p, and temperature¶
In [10]:
def translate(texts):
translated_texts = list()
for batch in get_batches(texts,batch_size=model.generation_config.batch_size):
translated_tokens = model.generate(
top_k=100,
top_p=0.8,
do_sample=True,
temperature=0.9,
**tokenizer(batch, return_tensors="pt",padding=True),
)
translated_texts += [tokenizer.decode(t, skip_special_tokens=True) for t in translated_tokens]
return translated_texts
In [11]:
# Example batch of input sentences
hf_predictions = translate(input_texts)
len(input_texts), len(hf_predictions), len(targets)
Generating Sequences: 0%| | 0/5 [00:00<?, ?it/s]
Out[11]:
(10, 10, 10)
In [12]:
bleu_scorer.compute(predictions=hf_predictions, references=targets)
Out[12]:
{'score': 15.895432255918095, 'counts': [124, 56, 31, 13], 'totals': [267, 257, 248, 239], 'precisions': [46.441947565543074, 21.78988326848249, 12.5, 5.439330543933054], 'bp': 0.9814476614410015, 'sys_len': 267, 'ref_len': 272}
Using beam search of width 4¶
We explicitly set do_sample=False.
In [13]:
def translate(texts):
translated_texts = list()
for batch in get_batches(texts,batch_size=model.generation_config.batch_size):
translated_tokens = model.generate(
num_beams=4,
do_sample=False,
length_penalty=0.6,
**tokenizer(batch, return_tensors="pt",padding=True),
)
translated_texts += [tokenizer.decode(t, skip_special_tokens=True) for t in translated_tokens]
return translated_texts
In [14]:
# Example batch of input sentences
beam_search_hf_predictions = translate(input_texts)
len(input_texts), len(beam_search_hf_predictions), len(targets)
Generating Sequences: 0%| | 0/5 [00:00<?, ?it/s]
Out[14]:
(10, 10, 10)
In [15]:
bleu_scorer.compute(predictions=beam_search_hf_predictions, references=targets)
Out[15]:
{'score': 20.084845774979332, 'counts': [134, 66, 38, 21], 'totals': [262, 252, 242, 233], 'precisions': [51.14503816793893, 26.19047619047619, 15.702479338842975, 9.012875536480687], 'bp': 0.9625512774839297, 'sys_len': 262, 'ref_len': 272}
With multinomial, top-p, top-k sampling and temperature¶
In [16]:
def translate(texts):
translated_texts = list()
for batch in get_batches(texts,batch_size=model.generation_config.batch_size):
translated_tokens = model.generate(
top_k=100,
top_p=0.8,
num_beams=4,
do_sample=True,
temperature=0.9,
length_penalty=0.6,
**tokenizer(batch, return_tensors="pt",padding=True),
)
translated_texts += [tokenizer.decode(t, skip_special_tokens=True) for t in translated_tokens]
return translated_texts
In [17]:
# Example batch of input sentences
beam_search_hf_predictions = translate(input_texts)
len(input_texts), len(beam_search_hf_predictions), len(targets)
Generating Sequences: 0%| | 0/5 [00:00<?, ?it/s]
Out[17]:
(10, 10, 10)
In [18]:
bleu_scorer.compute(predictions=beam_search_hf_predictions, references=targets)
Out[18]:
{'score': 20.198004312748168, 'counts': [135, 67, 38, 21], 'totals': [262, 252, 242, 233], 'precisions': [51.52671755725191, 26.58730158730159, 15.702479338842975, 9.012875536480687], 'bp': 0.9625512774839297, 'sys_len': 262, 'ref_len': 272}
Translate using generate-sequences¶
Checking the model's generation config:
In [19]:
model.generation_config
Out[19]:
GenerationConfig {
  "bad_words_ids": [
    [
      62801
    ]
  ],
  "batch_size": 2,
  "bos_token_id": 0,
  "decoder_start_token_id": 62801,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "pad_token_id": 62801,
  "use_cache": false
}
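For reference, here is a small sketch (added for this write-up, not in the original notebook) of the config fields the custom generators below rely on; the values come from the config printed above:

print(model.generation_config.decoder_start_token_id)  # 62801, the first decoder token
print(model.generation_config.eos_token_id)            # 0, generation stops at this token
print(model.generation_config.max_length)              # 512, maximum generated length
print(model.generation_config.batch_size)              # 2, set earlier for batching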
This is the generation function used for both greedy and beam search generation. It caches the encoder outputs for each batch (keyed by the JSON-serialized inputs) so the encoder only runs once per batch.
In [20]:
encoder_outputs = {}
def generate(inputs, decoder_input_ids):
global encoder_outputs
tokenizer_results = tokenizer(
inputs,
return_tensors="pt",
padding=True,
)
if not encoder_outputs.get(json.dumps(inputs)):
input_ids, attention_mask = (
tokenizer_results["input_ids"],
tokenizer_results["attention_mask"],
)
encoder_outputs[json.dumps(inputs)] = model.get_encoder()(
input_ids.repeat_interleave(
model.generation_config.num_beams,
dim=0,
),
return_dict=True,
attention_mask=attention_mask,
)
model_outputs = model(
**tokenizer_results,
decoder_input_ids=decoder_input_ids,
encoder_outputs=encoder_outputs[json.dumps(inputs)],
)
return model_outputs.logits
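To make the contract concrete, here is a hedged sketch (not part of the original notebook) of how the generators use this function: they pass the raw source texts of a batch together with the decoder tokens generated so far, and expect token logits back. A manual call with a single decoder start token per example would look like:

toy_batch = ['How are you?', 'Good morning']
start_ids = torch.full(
    (len(toy_batch), 1),
    model.generation_config.decoder_start_token_id,
    dtype=torch.long,
)
logits = generate(toy_batch, start_ids)
logits.shape  # expected shape for this call: (2, 1, vocab_size)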
Greedy Generation¶
In [21]:
greedy_sequences_generator = GreedyGenerator(
use_tqdm=True,
sort_inputs_by_size=True,
device=model.device,
generation_forward=generate,
batch_size=model.generation_config.batch_size,
max_length=model.generation_config.max_length,
eos_token_id=model.generation_config.eos_token_id,
decoder_start_token_id=model.generation_config.decoder_start_token_id,
)
In [22]:
prediction_ids = greedy_sequences_generator.generate(input_texts)
predictions = tokenizer.batch_decode(prediction_ids, skip_special_tokens=True)
len(input_texts), len(predictions), len(targets)
Generating Sequences: 0%| | 0/5 [00:00<?, ?it/s]
Out[22]:
(10, 10, 10)
In [23]:
bleu_scorer.compute(predictions=predictions, references=targets)
Out[23]:
{'score': 15.796125110909543, 'counts': [128, 58, 28, 13], 'totals': [264, 254, 244, 235], 'precisions': [48.484848484848484, 22.834645669291337, 11.475409836065573, 5.531914893617022], 'bp': 0.9701515036966302, 'sys_len': 264, 'ref_len': 272}
With multinomial, top-k, top-p sampling, and temperature¶
In [74]:
greedy_sequences_generator = GreedyGenerator(
use_tqdm=True,
temperature=0.9,
top_k_sampling=100,
top_p_sampling=0.8,
device=model.device,
sort_inputs_by_size=True,
multinomial_sampling=True,
generation_forward=generate,
batch_size=model.generation_config.batch_size,
max_length=model.generation_config.max_length,
eos_token_id=model.generation_config.eos_token_id,
decoder_start_token_id=model.generation_config.decoder_start_token_id,
)
In [75]:
prediction_ids = greedy_sequences_generator.generate(input_texts)
predictions = tokenizer.batch_decode(prediction_ids, skip_special_tokens=True)
len(input_texts), len(predictions), len(targets)
Generating Sequences: 0%| | 0/5 [00:00<?, ?it/s]
Out[75]:
(10, 10, 10)
In [76]:
bleu_scorer.compute(predictions=predictions, references=targets)
Out[76]:
{'score': 18.537916023808666, 'counts': [132, 62, 33, 19], 'totals': [266, 256, 246, 237], 'precisions': [49.62406015037594, 24.21875, 13.414634146341463, 8.016877637130802], 'bp': 0.9776961023999414, 'sys_len': 266, 'ref_len': 272}
Beam Search Generation¶
In [27]:
beam_search_sequences_generator = BeamSearchGenerator(
beam_width=4,
use_tqdm=True,
length_penalty=0.6,
device=model.device,
sort_inputs_by_size=True,
generation_forward=generate,
batch_size=model.generation_config.batch_size,
max_length=model.generation_config.max_length,
eos_token_id=model.generation_config.eos_token_id,
decoder_start_token_id=model.generation_config.decoder_start_token_id,
)
In [28]:
prediction_ids = beam_search_sequences_generator.generate(input_texts)
predictions = tokenizer.batch_decode(prediction_ids, skip_special_tokens=True)
len(input_texts), len(predictions), len(targets)
Generating Sequences: 0%| | 0/5 [00:00<?, ?it/s]
Out[28]:
(10, 10, 10)
In [29]:
bleu_scorer.compute(predictions=predictions, references=targets)
Out[29]:
{'score': 20.16216711910865, 'counts': [134, 67, 38, 21], 'totals': [261, 251, 241, 232], 'precisions': [51.34099616858238, 26.693227091633467, 15.767634854771785, 9.051724137931034], 'bp': 0.958730185172926, 'sys_len': 261, 'ref_len': 272}
With multinomial, top-p, top-k sampling, and temperature¶
In [36]:
beam_search_sequences_generator = BeamSearchGenerator(
beam_width=4,
use_tqdm=True,
temperature=0.9,
top_k_sampling=100,
length_penalty=0.6,
top_p_sampling=0.8,
device=model.device,
sort_inputs_by_size=True,
multinomial_sampling=True,
generation_forward=generate,
batch_size=model.generation_config.batch_size,
max_length=model.generation_config.max_length,
eos_token_id=model.generation_config.eos_token_id,
decoder_start_token_id=model.generation_config.decoder_start_token_id,
)
In [37]:
prediction_ids = beam_search_sequences_generator.generate(input_texts)
predictions = tokenizer.batch_decode(prediction_ids, skip_special_tokens=True)
len(input_texts), len(predictions), len(targets)
Generating Sequences: 0%| | 0/5 [00:00<?, ?it/s]
Out[37]:
(10, 10, 10)
In [38]:
bleu_scorer.compute(predictions=predictions, references=targets)
Out[38]:
{'score': 21.700676010280976, 'counts': [134, 70, 41, 25], 'totals': [261, 251, 241, 232], 'precisions': [51.34099616858238, 27.888446215139442, 17.012448132780083, 10.775862068965518], 'bp': 0.958730185172926, 'sys_len': 261, 'ref_len': 272}