mixvideo-v2/cargos/tvai-v2/视觉-语言模型配置/coca_ViT-B-32.json

30 lines
659 B
JSON

{
"embed_dim": 512,
"vision_cfg": {
"image_size": 224,
"layers": 12,
"width": 768,
"patch_size": 32,
"attentional_pool": true,
"attn_pooler_heads": 8,
"output_tokens": true
},
"text_cfg": {
"context_length": 76,
"vocab_size": 49408,
"width": 512,
"heads": 8,
"layers": 12,
"embed_cls": true,
"output_tokens": true
},
"multimodal_cfg": {
"context_length": 76,
"vocab_size": 49408,
"width": 512,
"heads": 8,
"layers": 12,
"attn_pooler_heads": 8
},
"custom_text": true
}