I want to take text_embeddings and combine them with the output of an intermediate layer of the CLIP text encoder. The input to the text encoder is a set of learnable prompt embeddings initialized with random values.
Can someone help me calculate the pooled output?
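For comparison, when I do have input_ids the pooled/projected output is straightforward to get; this is just to show the target shape (the model name is the same one I use below, the example text is arbitrary):

import torch
from transformers import CLIPTokenizerFast, CLIPTextModelWithProjection

tokenizer = CLIPTokenizerFast.from_pretrained("wisdomik/QuiltNet-B-32")
model = CLIPTextModelWithProjection.from_pretrained("wisdomik/QuiltNet-B-32")

inputs = tokenizer(["a histopathology image"], padding=True, return_tensors="pt")
with torch.no_grad():
    out = model(**inputs)

print(out.text_embeds.shape)  # [batch_size, projection_dim], e.g. torch.Size([1, 512])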
This is the code I have written. I tried to calculate the pooled output by adapting code from the transformers library, but I noticed that code relies on input_ids, which I don't have.
I tried to add the SOS/EOS tokens using a dummy input, but I am not sure whether that is the correct approach.
import torch
from transformers import CLIPTextModelWithProjection, CLIPTokenizerFast
text_embeddings = torch.randn(2, 4, 512)  # (batch, 4, width): features to splice in after an intermediate layer
ctx = torch.randn(2, 16, 512)             # (batch, n_ctx, width): learnable prompt embeddings
hf_tokenizer = CLIPTokenizerFast.from_pretrained("wisdomik/QuiltNet-B-32")
hf_text_encoder = CLIPTextModelWithProjection.from_pretrained("wisdomik/QuiltNet-B-32")
transformer = hf_text_encoder.text_model
final_ln = transformer.final_layer_norm
proj = hf_text_encoder.text_projection
encoder = transformer.encoder
# Earlier attempt with a forward hook on layer 4 (kept for reference):
# def layer_hook(module, inputs, outputs):
#     print(f"Entered hook")
#     print(f"outputs: {outputs[0].shape}")
#     outputs_ = outputs[0][:, text_embeddings.shape[1]:, :]
#     print(f"Extracted {outputs_.shape}")
#     out = torch.cat([text_embeddings, outputs_], dim=1)
#     print(f"concatenated: {out.shape}")
#     return [out]
# encoder.layers[4].register_forward_hook(layer_hook)
# Build the encoder input: SOS embedding + learnable ctx + the rest of a dummy
# tokenized prompt (so the EOS/padding positions exist). I use 16 placeholder
# tokens so that EOS lands right after the ctx positions.
dummy_token = hf_tokenizer(
    [" ".join(["X"] * ctx.shape[1])],
    padding="max_length",
    max_length=77,
    return_tensors="pt",
)
with torch.no_grad():
    # token + position embeddings, shape (1, 77, 512)
    embedding = transformer.embeddings(input_ids=dummy_token["input_ids"])
prefix = embedding[:, :1, :].expand(ctx.shape[0], -1, -1)                 # SOS
suffix = embedding[:, 1 + ctx.shape[1]:, :].expand(ctx.shape[0], -1, -1)  # EOS + padding
x = torch.cat([
    prefix,  # (batch, 1, dim)
    ctx,     # (batch, n_ctx, dim)
    suffix,  # (batch, 77 - 1 - n_ctx, dim)
], dim=1)
for idx, layer in enumerate(encoder.layers):
    # note: the stock CLIP text model also passes a causal attention mask here
    x = layer(x, attention_mask=None, causal_attention_mask=None)[0]
    if idx == 4:
        # after layer 4, drop the first text_embeddings.shape[1] positions and
        # splice in my precomputed text_embeddings instead
        x = x[:, text_embeddings.shape[1]:, :]
        x = torch.cat([text_embeddings, x], dim=1)
x = final_ln(x)
# My attempt at the pooled output (adapted from the transformers code below),
# using the dummy input_ids since I have no real ones:
# eos_token_id = 49407
# pooled_output = x[
#     torch.arange(x.shape[0], device=x.device),
#     (dummy_token["input_ids"].to(dtype=torch.int, device=x.device) == eos_token_id)
#     .int()
#     .argmax(dim=-1),
# ]
o = proj(x)  # currently (batch, 77, 512) because x is still the full sequence
print(f"Output: {o.shape}")
This is what the transformers library does:
if self.eos_token_id == 2:
    # The `eos_token_id` was incorrect before PR #24773: Let's keep what have been done here.
    # A CLIP model with such `eos_token_id` in the config can't work correctly with extra new tokens added
    # ------------------------------------------------------------
    # text_embeds.shape = [batch_size, sequence_length, transformer.width]
    # take features from the eot embedding (eot_token is the highest number in each sequence)
    # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
    pooled_output = last_hidden_state[
        torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
        input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
    ]
else:
    # The config gets updated `eos_token_id` from PR #24773 (so the use of extra new tokens is possible)
    pooled_output = last_hidden_state[
        torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
        # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`)
        # Note: we assume each sequence (along batch dim.) contains an `eos_token_id` (e.g. prepared by the tokenizer)
        (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
        .int()
        .argmax(dim=-1),
    ]
(from https://github.com/huggingface/transformers/blob/v4.53.2/src/transformers/models/clip/modeling_clip.py, around line 639)
I want a shape of [batch_size, embed_dim], but what I am getting right now is [batch_size, seq_len, embed_dim].
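Concretely, this is what I think the pooling step should look like in my case. Since I build the sequence myself as SOS + ctx + (EOS and padding taken from the dummy prompt), I am assuming the EOS position is simply 1 + ctx.shape[1] for every sequence in the batch, so after final_ln I would do:

eos_pos = 1 + ctx.shape[1]  # assumes the dummy prompt puts EOS right after the ctx tokens
pooled = x[torch.arange(x.shape[0], device=x.device), eos_pos]  # (batch, 512)
o = proj(pooled)  # (batch, projection_dim)
print(o.shape)

Is this the right way to get the pooled output when there are no real input_ids?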