Fuyu processor: box coordinates #27083 (Closed)
79 changes: 78 additions & 1 deletion src/transformers/models/fuyu/processing_fuyu.py

@@ -387,7 +387,7 @@ def to(self, device: Union[str, "torch.device"]) -> "FuyuBatchEncoding":
         if isinstance(device, str) or is_torch_device(device) or isinstance(device, int):
             for batch_key, batch_element in self.data.items():
                 if isinstance(batch_element, list):
-                    moved_element = [item.to("cuda") for item in batch_element]
+                    moved_element = [item.to(device=device) for item in batch_element]
                 else:
                     moved_element = batch_element.to(device=device)
                 self.data[batch_key] = moved_element
@@ -652,6 +652,83 @@ def __call__(

        return FuyuBatchEncoding(data=batch_encoding)

    def post_process_box_coordinates(self, outputs, target_sizes=None):
        """
        Transforms raw coordinates detected by [`FuyuForCausalLM`] to the original images' coordinate space.
        Coordinates will be returned in "box" format, with the following pattern:
        `<box>top, left, bottom, right</box>`

        Point coordinates are not supported yet.

        Args:
            outputs ([`GenerateOutput`]):
                Raw outputs from `generate`.
            target_sizes (`torch.Tensor`, *optional*):
                Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image
                in the batch. If set, coordinates found in the output sequence are rescaled to the target sizes. If
                left to `None`, coordinates will not be rescaled.

        Returns:
            `GenerateOutput`: Same output type returned by `generate`, with output token ids replaced with boxed
            and possibly rescaled coordinates.
        """

        def scale_factor_to_fit(original_size, target_size=None):
            height, width = original_size
            if target_size is None:
                max_height = self.image_processor.target_height
                max_width = self.image_processor.target_width
            else:
                max_height, max_width = target_size
            if width <= max_width and height <= max_height:
                return 1.0
            return min(max_height / height, max_width / width)

        def tokens_to_box(tokens, original_size):
            bbox_start = self.tokenizer.convert_tokens_to_ids(BBOX_OPEN_STRING)
            bbox_end = self.tokenizer.convert_tokens_to_ids(BBOX_CLOSE_STRING)
            try:
                # Assumes a single box output per sequence
                bbox_start_pos = (tokens == bbox_start).nonzero(as_tuple=True)[0].item()
                bbox_end_pos = (tokens == bbox_end).nonzero(as_tuple=True)[0].item()
            except (RuntimeError, ValueError):
                # `.item()` raises when the box delimiters are missing (or not unique);
                # in that case, return the sequence unchanged
                return tokens

            if bbox_end_pos != bbox_start_pos + 5:
Collaborator (@amyeroberts): Where does the 5 come from here?

Member (author): Sorry, should have explained!

The model returns coordinates in the following format:

  • Beginning-of-bbox delimiter, which is a single token id.
  • 4 token ids corresponding to the scaled coordinate numbers, without any delimiters.
  • End-of-bbox delimiter, another single token id.

So we find the begin and end delimiters and verify that there are exactly 4 token ids in between.

The same approach is taken for 2D point coordinates; I'll incorporate them now, as well as the reverse pre-processing transformation.

Thanks a lot for the quick review and comments @amyeroberts!
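
A minimal sketch of that layout (the token ids below are placeholders, not real Fuyu vocabulary entries):

```python
import torch

# Hypothetical ids for the <box> / </box> delimiter tokens
BBOX_OPEN_ID, BBOX_CLOSE_ID = 71013, 71014

# ...  <box>  top  left  bottom  right  </box>  ...
tokens = torch.tensor([9, BBOX_OPEN_ID, 412, 217, 890, 655, BBOX_CLOSE_ID, 9])

bbox_start_pos = (tokens == BBOX_OPEN_ID).nonzero(as_tuple=True)[0].item()   # 1
bbox_end_pos = (tokens == BBOX_CLOSE_ID).nonzero(as_tuple=True)[0].item()    # 6
assert bbox_end_pos == bbox_start_pos + 5  # exactly 4 coordinate tokens in between
```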

                return tokens

            # Retrieve the transformed coordinate tokens between the two delimiters
            coords = self.tokenizer.convert_ids_to_tokens(tokens[bbox_start_pos + 1 : bbox_end_pos])

            # Scale back to the original image size and multiply by 2
            scale = scale_factor_to_fit(original_size)
            top, left, bottom, right = [2 * int(float(c) / scale) for c in coords]

            # Replace the ids so the sequence detokenizes to a readable box string
            replacement = f" <box>{top}, {left}, {bottom}, {right}</box>"
            replacement = self.tokenizer.tokenize(replacement)[1:]
            replacement = self.tokenizer.convert_tokens_to_ids(replacement)
            replacement = torch.tensor(replacement).to(tokens)

            tokens = torch.cat([tokens[:bbox_start_pos], replacement, tokens[bbox_end_pos + 1 :]], 0)
            return tokens

        if target_sizes is None:
            target_sizes = ((self.image_processor.target_height, self.image_processor.target_width),) * len(outputs)
        # Only user-provided sizes are tensors with a `.shape`; the default above is a plain tuple
        elif target_sizes.shape[1] != 2:
            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")

        if len(outputs) != len(target_sizes):
            raise ValueError("Make sure that you pass in as many target sizes as output sequences")

        results = None
        for seq, size in zip(outputs, target_sizes):
            seq = tokens_to_box(seq, size)[None, :]
            # TODO: what if sequence lengths vary?
            results = seq if results is None else torch.cat((results, seq), dim=0)

        return results
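
For reference, a minimal end-to-end sketch of how this method is meant to be used; the checkpoint name, image path, and prompt are illustrative, and only `post_process_box_coordinates` is introduced by this PR:

```python
import torch
from PIL import Image
from transformers import FuyuForCausalLM, FuyuProcessor

# Illustrative checkpoint; any Fuyu checkpoint compatible with this processor should work
processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b", torch_dtype=torch.float16, device_map="cuda:0")

image = Image.open("bus.png")  # hypothetical local image
prompt = "When presented with a box, perform OCR to extract text contained within it."
inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda:0")

outputs = model.generate(**inputs, max_new_tokens=20)
# Rescale any detected boxes back to the original image's coordinate space
outputs = processor.post_process_box_coordinates(
    outputs, target_sizes=torch.tensor([image.size[::-1]])  # (height, width) per image
)
print(processor.batch_decode(outputs, skip_special_tokens=True))
```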

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please