mixvideo-v2/cargos/tvai-v2/视觉-语言模型配置/ViT-L-14-CLIPA.json

25 lines
576 B
JSON

{
    "embed_dim": 768,
    "vision_cfg": {
        "image_size": 224,
        "layers": 24,
        "width": 1024,
        "patch_size": 14,
        "no_ln_pre": true,
        "pool_type": "avg",
        "final_ln_after_pool": true
    },
    "text_cfg": {
        "context_length": 32,
        "vocab_size": 32000,
        "hf_tokenizer_name": "bert-base-uncased",
        "tokenizer_kwargs": {
            "strip_sep_token": true
        },
        "width": 768,
        "heads": 12,
        "layers": 12,
        "pool_type": "last",
        "no_causal_mask": true
    }
}