Contents
open_clip installation
Solution
Where ViewCrafter uses it
FrozenOpenCLIPEmbedder
open_clip installation
pip install open-clip-torch==2.17.1
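A quick sanity check that the install worked (a generic open_clip check, not a ViewCrafter-specific step):

# Confirm the installed version and that the pretrained registry is available.
import open_clip
print(open_clip.__version__)            # expect 2.17.1
print(open_clip.list_pretrained()[:3])  # a few (architecture, pretrained-tag) pairs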
Using it then throws an error:
model, _, preprocess = open_clip.create_model_and_transforms('ViT-L-14', pretrained='laion/CLIP-ViT-L-14-laion2B-s32B-b82K/pytorch_model.bin')
The error:
model, _, preprocess = open_clip.create_model_and_transforms(
  File "/data/.local/lib/python3.10/site-packages/open_clip/factory.py", line 382, in create_model_and_transforms
    model = create_model(
  File "/data/.local/lib/python3.10/site-packages/open_clip/factory.py", line 288, in create_model
    load_checkpoint(model, checkpoint_path)
  File "/data/.local/lib/python3.10/site-packages/open_clip/factory.py", line 159, in load_checkpoint
    incompatible_keys = model.load_state_dict(state_dict, strict=strict)
  File "/data/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2152, in load_state_dict
    raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
RuntimeError: Error(s) in loading state_dict for CLIP:
    Missing key(s) in state_dict: "positional_embedding", "text_projection", "visual.class_embedding", "visual.positional_embedding", "visual.proj"
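The missing keys point to a checkpoint-format mismatch: pytorch_model.bin in that repository is the Hugging Face transformers CLIPModel checkpoint, so its parameter names do not line up with the layout open_clip's CLIP module expects. A quick, hedged way to confirm this with the file already downloaded locally:

# Peek at the checkpoint keys (sketch; path is the same local file used above).
import torch

sd = torch.load("laion/CLIP-ViT-L-14-laion2B-s32B-b82K/pytorch_model.bin", map_location="cpu")
print(list(sd.keys())[:5])  # transformers-style names (text_model.* / vision_model.*),
                            # not the positional_embedding / visual.* names open_clip wants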
Solution
Go to https://huggingface.co/laion/CLIP-ViT-L-14-laion2B-s32B-b82K/tree/main and download open_clip_pytorch_model.bin, the open_clip-format weights in the same repository.
Then change the code to:
model, _, preprocess = open_clip.create_model_and_transforms('ViT-L-14', pretrained='laion/CLIP-ViT-L-14-laion2B-s32B-b82K/open_clip_pytorch_model.bin')
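If you prefer not to download the file by hand, a small sketch using huggingface_hub (assuming it is installed) fetches the same file and passes the returned local path straight to open_clip:

# Fetch the open_clip-format weights and load them (sketch; requires huggingface_hub).
import open_clip
from huggingface_hub import hf_hub_download

ckpt = hf_hub_download(
    repo_id="laion/CLIP-ViT-L-14-laion2B-s32B-b82K",
    filename="open_clip_pytorch_model.bin",
)
model, _, preprocess = open_clip.create_model_and_transforms("ViT-L-14", pretrained=ckpt)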
Original post: https://blog.csdn.net/zengNLP/article/details/135644453
Where ViewCrafter uses it
/mnt/data-2/users/libanggeng/project/drag/ViewCrafter/lvdm/modules/encoders/condition.py
model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device('cpu'), pretrained=version)
FrozenOpenCLIPEmbedder
# From lvdm/modules/encoders/condition.py; needs torch and open_clip,
# and AbstractEncoder (lvdm's nn.Module-based encoder base class).
import torch
import open_clip


class FrozenOpenCLIPEmbedder(AbstractEncoder):
    """Uses the OpenCLIP transformer encoder for text"""
    LAYERS = [
        # "pooled",
        "last",
        "penultimate"
    ]

    def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", device="cuda", max_length=77,
                 freeze=True, layer="last"):
        super().__init__()
        assert layer in self.LAYERS
        model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device('cpu'), pretrained=version)
        del model.visual  # only the text tower is needed for conditioning
        self.model = model

        self.device = device
        self.max_length = max_length
        if freeze:
            self.freeze()
        self.layer = layer
        if self.layer == "last":
            self.layer_idx = 0
        elif self.layer == "penultimate":
            self.layer_idx = 1
        else:
            raise NotImplementedError()

    def freeze(self):
        self.model = self.model.eval()
        for param in self.parameters():
            param.requires_grad = False

    def forward(self, text):
        tokens = open_clip.tokenize(text)  # all CLIP models use 77 as the context length
        z = self.encode_with_transformer(tokens.to(self.device))
        return z

    def encode_with_transformer(self, text):
        x = self.model.token_embedding(text)  # [batch_size, n_ctx, d_model]
        x = x + self.model.positional_embedding
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.text_transformer_forward(x, attn_mask=self.model.attn_mask)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.model.ln_final(x)
        return x
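The excerpt above stops before text_transformer_forward (which encode_with_transformer calls) and the encode wrapper. A hedged sketch of those two methods, based on the upstream Stable Diffusion 2 / LVDM version of this class that ViewCrafter's copy follows; the exact code in condition.py may differ slightly:

    # Hedged sketch, not copied from ViewCrafter's file.
    def text_transformer_forward(self, x, attn_mask=None):
        # Walk the text transformer block by block so we can stop layer_idx
        # blocks before the end (this is what layer="penultimate" selects).
        for i, r in enumerate(self.model.transformer.resblocks):
            if i == len(self.model.transformer.resblocks) - self.layer_idx:
                break
            x = r(x, attn_mask=attn_mask)
        return x

    def encode(self, text):
        return self(text)

For offline use the same fix applies here: pass a local path to the open_clip-format weights (for the matching architecture) as version instead of the hub tag, for example:

    # Hypothetical local path to a downloaded open_clip_pytorch_model.bin.
    embedder = FrozenOpenCLIPEmbedder(version="/path/to/open_clip_pytorch_model.bin", device="cpu")
    z = embedder(["a photo of a cat"])  # [1, 77, d_model] text features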