mixvideo-v2/cargos/tvai-v2/视觉-语言模型配置/ViT-L-16-SigLIP2-512.json

{
    "embed_dim": 1024,
    "init_logit_bias": -10,
    "custom_text": true,
    "vision_cfg": {
        "image_size": 512,
        "timm_model_name": "vit_large_patch16_siglip_512",
        "timm_model_pretrained": false,
        "timm_pool": "map",
        "timm_proj": "none"
    },
    "text_cfg": {
        "context_length": 64,
        "vocab_size": 256000,
        "hf_tokenizer_name": "timm/ViT-L-16-SigLIP2-512",
        "tokenizer_kwargs": {
            "clean": "canonicalize"
        },
        "width": 1024,
        "heads": 16,
        "layers": 24,
        "no_causal_mask": true,
        "proj_bias": true,
        "pool_type": "last",
        "norm_kwargs":{
            "eps": 1e-6
        },
        "act_kwargs": {
            "approximate": "tanh"
        }
    }
}