siglip2 zero-shot
utils
本文字数:389 字 | 阅读时长 ≈ 2 min

注:HF 版 prompt 中全部大写字母改为小写,同时不加句号(timm/open_clip 版保留大写和句号)。

import torch
from transformers import AutoModel, AutoProcessor
from transformers.image_utils import load_image
import torch
from PIL import Image
from open_clip import create_model_from_pretrained, get_tokenizer
import numpy as np
import warnings
warnings.filterwarnings("ignore")


# --- Zero-shot classification with the HF Transformers SigLIP2 checkpoint ---
print("Using HF siglip2-so400m-patch14-384 model")
ckpt = "/ssd/1/wangyh/vitv2/TBStarsViT/siglip2-so400m-patch14-384"
model = AutoModel.from_pretrained(ckpt).eval().cuda()
processor = AutoProcessor.from_pretrained(ckpt, use_fast=True)
# image = "/ssd/1/wangyh/data/evaluation/imagenet1k_zero_shot/val/n02085620/ILSVRC2012_val_00002921.JPEG"
image = "/ssd/1/wangyh/data/evaluation/imagenet1k_zero_shot/val/n02085620/ILSVRC2012_val_00006079.JPEG"

image = load_image(image)
# ImageNet-1k class indices for the candidate labels below: 149, 150, *151, 152, 904, 435
# (presumably *151 = chihuahua marks the ground-truth class of this val image — TODO confirm)
candidate_labels = ['dugong', 'sea lion', 'chihuahua', 'japanese chin', 'window screen', 'bathtub']
# HF prompts: all lowercase, no trailing period (contrast with the timm section below).
texts = [f'this is a photo of {label}' for label in candidate_labels]
inputs = processor(text=texts, images=image, padding="max_length", max_length=64, return_tensors="pt").to("cuda")
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image
# SigLIP scores each label with an independent sigmoid (no softmax across labels),
# so the probabilities do not need to sum to 1.
probs = torch.sigmoid(logits_per_image)
# Tensors already provide .tolist(); the former np.array(...) round-trip was redundant.
print(probs.cpu().tolist())
# Fix: round AFTER scaling to percent. The previous `100 * round(p.item(), 3)`
# printed float noise such as 12.299999999999999.
zipped_list = list(zip(candidate_labels, [round(100 * p.item(), 1) for p in probs[0]]))
print("probabilities: ", zipped_list)




##################
# --- Same zero-shot task with the open_clip (timm) SigLIP2 checkpoint, for comparison ---
print("\n\nUsing timm siglip2-so400m-patch14-384 model")
model, preprocess = create_model_from_pretrained('local-dir:/ssd/1/wangyh/EVA/EVA-02/asuka/ckpt/google/timm-siglip2-so400m-patch14-384/')
model.eval()
tokenizer = get_tokenizer('local-dir:/ssd/1/wangyh/EVA/EVA-02/asuka/ckpt/google/timm-siglip2-so400m-patch14-384/')

# image = "/ssd/1/wangyh/data/evaluation/imagenet1k_zero_shot/val/n02085620/ILSVRC2012_val_00002921.JPEG"
image = "/ssd/1/wangyh/data/evaluation/imagenet1k_zero_shot/val/n02085620/ILSVRC2012_val_00006079.JPEG"
image = Image.open(image)
image = preprocess(image).unsqueeze(0)
# (Removed a second, redundant get_tokenizer() call that re-loaded the tokenizer built above.)
# timm prompts: capitalized label + trailing period (contrast with the HF section above).
candidate_labels = ['dugong', 'sea lion', 'Chihuahua', 'Japanese Chin', 'window screen', 'bathtub']
labels_list = [f'This is a photo of {label}.' for label in candidate_labels]
text = tokenizer(labels_list, context_length=64)
# NOTE(review): model and tensors are never moved to CUDA in this section while
# autocast targets CUDA — confirm whether CPU inference here is intended.
with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.encode_image(image, normalize=True)
    text_features = model.encode_text(text, normalize=True)
    # SigLIP head: sigmoid over scaled cosine similarities plus a learned bias.
    probs = torch.sigmoid(image_features @ text_features.T * model.logit_scale.exp() + model.logit_bias)
# Tensors already provide .tolist(); the former np.array(...) round-trip was redundant.
print(probs.cpu().tolist())
# Fix: round AFTER scaling to percent, avoiding float noise in the printed values.
zipped_list = list(zip(candidate_labels, [round(100 * p.item(), 1) for p in probs[0]]))
print("probabilities: ", zipped_list)
2025年5月6日