Perplexity Eval for Text Generation Models #1073
```diff
@@ -89,6 +89,8 @@ class TextGenerationPipeline(TransformersPipeline):
         of tokens supplied even if the stop token is reached.
     :param use_deepsparse_cache: if True, the pipeline will use the deepsparse kv cache
         for caching the model outputs.
+    :param tokenizer_padding_side: the side to pad the input sequence to.
+        Either "left" or "right". Defaults to "left".
     :param kwargs: kwargs to pass to the TransformersPipeline
     """
```

**Review comment** (on the `tokenizer_padding_side` docs): As discussed offline, running right-padded for eval will likely not work for the engine (single-token prefill). Internally it builds the KV cache assuming left padding and pops entries from the left side of the cache as it is built up; in the right-padded scenario I believe this would evict the actual non-padded values from the cache too early.
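To make the comment concrete, here is a minimal sketch of how `padding_side` changes the token layout, assuming a GPT-2 tokenizer stands in for the pipeline's tokenizer (illustrative only, not part of the PR):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # gpt2 ships without a pad token

for side in ("left", "right"):
    tokenizer.padding_side = side
    out = tokenizer("hello world", padding="max_length", max_length=5)
    print(side, out["input_ids"], out["attention_mask"])
# left:  [pad, pad, pad, hello, world] -- real tokens sit at the end, so an
#        engine that pops cache entries from the left discards padding first.
# right: [hello, world, pad, pad, pad] -- real tokens sit at the start, so
#        popping from the left would evict them before the padding.
```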
```diff
@@ -101,6 +103,7 @@ def __init__(
         prompt_processing_sequence_length: int = 128,
         force_max_tokens: bool = False,
         use_deepsparse_cache: bool = False,
+        tokenizer_padding_side: str = "left",
         **kwargs,
     ):
         if use_deepsparse_cache:
```
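For reference, a hypothetical usage sketch of the new parameter; the task name and model path below are assumptions for illustration, and only `tokenizer_padding_side` comes from this PR:

```python
from deepsparse import Pipeline

pipeline = Pipeline.create(
    task="text_generation",          # assumed task registration name
    model_path="./deployment",       # placeholder model path
    tokenizer_padding_side="left",   # new in this PR; see the padding caveat above
)
```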
```diff
@@ -126,8 +129,7 @@ def __init__(
         self.prompt_processing_sequence_length = prompt_processing_sequence_length
         self.force_max_tokens = force_max_tokens

-        # override tokenizer to pad to left
-        self.tokenizer.padding_side = "left"
+        self.tokenizer.padding_side = tokenizer_padding_side

         self.engine = None
         self.multitoken_engine = NLDecoderEngine(
```
```diff
@@ -207,6 +209,8 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]:
             return_tensors="np",
             max_length=self.sequence_length,
             padding="max_length",
+            # TODO: Truncating by default may be a problem
+            truncation=True,
         )

         attention_mask = input_tokens["attention_mask"]
```
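The TODO above flags that `truncation=True` silently drops tokens past `max_length`, which matters for a perplexity eval since dropped tokens are never scored. A minimal sketch of the behavior, again assuming a GPT-2 tokenizer as a stand-in:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

tokens = tokenizer(
    "a prompt that runs well past the configured max length",
    return_tensors="np",
    max_length=4,
    padding="max_length",
    truncation=True,  # everything after the 4th token is silently dropped
)
print(tokens["input_ids"].shape)  # (1, 4) no matter how long the prompt is
```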
```diff
@@ -240,7 +244,9 @@ def process_engine_outputs(
         """
         generated_tokens, generated_logits = engine_outputs
         sequences = self.tokenizer.batch_decode(
-            *generated_tokens, skip_special_tokens=True
+            # TODO: hack for now, make it general
+            *generated_tokens[0],
+            skip_special_tokens=True,
         )
         logits = generated_logits if kwargs.get("return_logits") else None
```
**Review comment** (on `process_engine_outputs`): We need all of the logits predicted from the prefix sequences {}, {x1}, {x1, x2}, ..., {x1, x2, ..., x_n}; that is, the logits at every position, since a perplexity eval scores each token against the distribution predicted from its prefix.
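To spell out why every position's logits are needed: perplexity is the exponential of the mean negative log-likelihood, and the likelihood of token x_t comes from the logits emitted after the prefix {x_1, ..., x_{t-1}}. A minimal NumPy sketch (not the PR's implementation; the shapes are assumptions):

```python
import numpy as np

def perplexity(logits: np.ndarray, token_ids: np.ndarray) -> float:
    """logits: (seq_len, vocab_size), where logits[t] predicts token_ids[t + 1]."""
    # numerically stable log-softmax over the vocab dimension
    shifted = logits - logits.max(axis=-1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))
    # log-probability the model assigned to each actual next token
    next_token_log_probs = log_probs[np.arange(len(token_ids) - 1), token_ids[1:]]
    return float(np.exp(-next_token_log_probs.mean()))
```

If only the logits of newly generated tokens were returned, the prompt tokens could never be scored, which is why the reviewer asks for the logits at every position.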