mixvideo-v2/cargos/tvai-v2/视觉-语言模型配置/MobileCLIP-B.json

21 lines
483 B
JSON

{
"embed_dim": 512,
"vision_cfg": {
"timm_model_name": "vit_base_mci_224",
"timm_model_pretrained": false,
"timm_pool": "token",
"timm_proj": null,
"timm_drop": 0.0,
"timm_drop_path": 0.0,
"image_size": 224
},
"text_cfg": {
"context_length": 77,
"vocab_size": 49408,
"width": 512,
"heads": 8,
"layers": 12,
"no_causal_mask": false
},
"custom_text": true
}