
Implementation of Zero-Shot CLIP Classifier #1737

Merged (8 commits) on Sep 4, 2023
Changes from 4 commits
67 changes: 67 additions & 0 deletions configs/clip_zs/clip-vit-base-patch16_cifar100.py
@fangyixiao18 (Collaborator) commented on Aug 31, 2023:

Move all four of these configs to the configs/clip/ folder and rename them to a format like clip_vit-base-p16_zeroshot-cls_cifar100.py, just as the chinese_clip configs do.
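
Under that naming scheme, the four configs here would presumably end up as something like:

configs/clip/clip_vit-base-p16_zeroshot-cls_cifar100.py
configs/clip/clip_vit-base-p16_zeroshot-cls_in1k.py
configs/clip/clip_vit-large-p14_zeroshot-cls_cifar100.py
configs/clip/clip_vit-large-p14_zeroshot-cls_in1k.py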

@@ -0,0 +1,67 @@
_base_ = '../_base_/default_runtime.py'

# data settings
data_preprocessor = dict(
    type='MultiModalDataPreprocessor',
    mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255],
    std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255],
    to_rgb=False,
)

test_pipeline = [
    dict(type='Resize', scale=(224, 224), interpolation='bicubic'),
    dict(
        type='PackInputs',
        algorithm_keys=['text'],
        meta_keys=['image_id', 'scale_factor'],
    ),
]

train_dataloader = None
test_dataloader = dict(
    batch_size=32,
    num_workers=8,
    dataset=dict(
        type='CIFAR100',
        data_root='/public/DATA/qbw/img_cls_dataset/cifar100',
Collaborator comment:

Suggested change
-        data_root='/public/DATA/qbw/img_cls_dataset/cifar100',
+        data_root='data/cifar100',

We use this relative data/ path format instead of your real path in the released code; please also check the other configs.

        split='test',
        pipeline=test_pipeline),
    sampler=dict(type='DefaultSampler', shuffle=False),
)
test_evaluator = dict(type='Accuracy', topk=(1, 5))

# schedule settings
train_cfg = None
val_cfg = None
test_cfg = dict()

# model settings
model = dict(
    type='CLIP_zs',
    vision_backbone=dict(
        type='CLIPVisionTransformer',
        input_resolution=224,
        patch_size=16,
        width=768,
        layers=12,
        heads=12,
        output_dim=512,
    ),
    text_backbone=dict(
        type='CLIPTransformer',
        width=512,
        layers=12,
        heads=8,
        attn_mask=True,
    ),
    tokenizer=dict(
        type='AutoTokenizer',
        name_or_path='openai/clip-vit-base-patch16',
        use_fast=False),
    vocab_size=49408,
    transformer_width=512,
    proj_dim=512,
    text_prototype='cifar100',
    text_prompt='openai_cifar100',
    context_length=77,
)
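
For context, the config above only wires the modules together; zero-shot classification itself works by comparing the image embedding against text embeddings built from the class names. Below is a minimal, illustrative sketch of that idea using the Hugging Face transformers CLIP API rather than the mmpretrain modules in this PR; the class names, prompt template, and image path are placeholders.

# Illustrative sketch only: zero-shot classification with Hugging Face CLIP,
# not the CLIP_zs module added in this PR.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained('openai/clip-vit-base-patch16')
processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch16')

class_names = ['apple', 'bicycle', 'whale']  # e.g. a few CIFAR-100 classes
prompts = [f'a photo of a {name}.' for name in class_names]

image = Image.open('example.jpg')  # placeholder image path
inputs = processor(text=prompts, images=image, return_tensors='pt', padding=True)

with torch.no_grad():
    # logits_per_image holds image-text similarities scaled by the learned
    # temperature; softmax turns them into class probabilities.
    probs = model(**inputs).logits_per_image.softmax(dim=-1)

print(class_names[probs.argmax(dim=-1).item()])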
69 changes: 69 additions & 0 deletions configs/clip_zs/clip-vit-base-patch16_in1k.py
@@ -0,0 +1,69 @@
_base_ = '../_base_/default_runtime.py'

# data settings
data_preprocessor = dict(
    type='MultiModalDataPreprocessor',
    mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255],
    std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255],
    to_rgb=True,
)

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='Resize', scale=(224, 224), interpolation='bicubic'),
    dict(
        type='PackInputs',
        algorithm_keys=['text'],
        meta_keys=['image_id', 'scale_factor'],
    ),
]

train_dataloader = None
test_dataloader = dict(
    batch_size=32,
    num_workers=8,
    dataset=dict(
        type='ImageNet',
        data_root=
        '/public/DATA/qbw/img_cls_dataset/in1k/imagenet-1k-huggingface/data/',
        split='val',
        pipeline=test_pipeline),
    sampler=dict(type='DefaultSampler', shuffle=False),
)
test_evaluator = dict(type='Accuracy', topk=(1, 5))

# schedule settings
train_cfg = None
val_cfg = None
test_cfg = dict()

# model settings
model = dict(
    type='CLIP_zs',
    vision_backbone=dict(
        type='CLIPVisionTransformer',
        input_resolution=224,
        patch_size=16,
        width=768,
        layers=12,
        heads=12,
        output_dim=512,
    ),
    text_backbone=dict(
        type='CLIPTransformer',
        width=512,
        layers=12,
        heads=8,
        attn_mask=True,
    ),
    tokenizer=dict(
        type='AutoTokenizer',
        name_or_path='openai/clip-vit-base-patch16',
        use_fast=False),
    vocab_size=49408,
    transformer_width=512,
    proj_dim=512,
    text_prototype='imagenet',
    text_prompt='openai_imagenet_sub',  # openai_imagenet, openai_imagenet_sub
    context_length=77,
)
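
The text_prototype and text_prompt keys above control how the per-class text prototypes are built: each class name is inserted into a set of prompt templates, every prompt is encoded by the text backbone, and the normalized embeddings are averaged into one prototype per class ('openai_imagenet' versus 'openai_imagenet_sub' presumably selects the full OpenAI ImageNet template set or a reduced subset). A rough sketch of that prompt-ensembling step follows; encode_text, the templates, and the class list are stand-ins, not this PR's actual implementation.

# Rough sketch of prompt ensembling for per-class text prototypes; encode_text
# stands in for the CLIP text backbone + projection used by this PR.
import torch

def build_text_prototypes(class_names, templates, encode_text):
    prototypes = []
    for name in class_names:
        prompts = [t.format(name) for t in templates]  # fill each template
        emb = encode_text(prompts)                      # (num_templates, proj_dim)
        emb = emb / emb.norm(dim=-1, keepdim=True)      # normalize each prompt embedding
        proto = emb.mean(dim=0)                         # ensemble over templates
        prototypes.append(proto / proto.norm())         # re-normalize the ensemble
    return torch.stack(prototypes)                      # (num_classes, proj_dim)

templates = ['a photo of a {}.', 'a bad photo of a {}.', 'a photo of the large {}.']
dummy_encoder = lambda prompts: torch.randn(len(prompts), 512)  # placeholder encoder
print(build_text_prototypes(['goldfish', 'tench'], templates, dummy_encoder).shape)

At test time the normalized image embedding is compared against these prototypes with a dot product, and the Accuracy evaluator above reports top-1/top-5 results.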
67 changes: 67 additions & 0 deletions configs/clip_zs/clip-vit-large-patch14_cifar100.py
@@ -0,0 +1,67 @@
_base_ = '../_base_/default_runtime.py'

# data settings
data_preprocessor = dict(
    type='MultiModalDataPreprocessor',
    mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255],
    std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255],
    to_rgb=False,
)

test_pipeline = [
    dict(type='Resize', scale=(224, 224), interpolation='bicubic'),
    dict(
        type='PackInputs',
        algorithm_keys=['text'],
        meta_keys=['image_id', 'scale_factor'],
    ),
]

train_dataloader = None
test_dataloader = dict(
    batch_size=32,
    num_workers=8,
    dataset=dict(
        type='CIFAR100',
        data_root='/public/DATA/qbw/img_cls_dataset/cifar100',
        split='test',
        pipeline=test_pipeline),
    sampler=dict(type='DefaultSampler', shuffle=False),
)
test_evaluator = dict(type='Accuracy', topk=(1, 5))

# schedule settings
train_cfg = None
val_cfg = None
test_cfg = dict()

# model settings
model = dict(
    type='CLIP_zs',
    vision_backbone=dict(
        type='CLIPVisionTransformer',
        input_resolution=224,
        patch_size=14,
        width=1024,
        layers=24,
        heads=16,
        output_dim=768,
    ),
    text_backbone=dict(
        type='CLIPTransformer',
        width=768,
        layers=12,
        heads=12,
        attn_mask=True,
    ),
    tokenizer=dict(
        type='AutoTokenizer',
        name_or_path='openai/clip-vit-large-patch14',
        use_fast=False),
    vocab_size=49408,
    transformer_width=768,
    proj_dim=768,
    text_prototype='cifar100',
    text_prompt='openai_cifar100',
    context_length=77,
)
69 changes: 69 additions & 0 deletions configs/clip_zs/clip-vit-large-patch14_in1k.py
@@ -0,0 +1,69 @@
_base_ = '../_base_/default_runtime.py'

# data settings
data_preprocessor = dict(
    type='MultiModalDataPreprocessor',
    mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255],
    std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255],
    to_rgb=True,
)

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='Resize', scale=(224, 224), interpolation='bicubic'),
    dict(
        type='PackInputs',
        algorithm_keys=['text'],
        meta_keys=['image_id', 'scale_factor'],
    ),
]

train_dataloader = None
test_dataloader = dict(
    batch_size=32,
    num_workers=8,
    dataset=dict(
        type='ImageNet',
        data_root=
        '/public/DATA/qbw/img_cls_dataset/in1k/imagenet-1k-huggingface/data/',
        split='val',
        pipeline=test_pipeline),
    sampler=dict(type='DefaultSampler', shuffle=False),
)
test_evaluator = dict(type='Accuracy', topk=(1, 5))

# schedule settings
train_cfg = None
val_cfg = None
test_cfg = dict()

# model settings
model = dict(
    type='CLIP_zs',
    vision_backbone=dict(
        type='CLIPVisionTransformer',
        input_resolution=224,
        patch_size=14,
        width=1024,
        layers=24,
        heads=16,
        output_dim=768,
    ),
    text_backbone=dict(
        type='CLIPTransformer',
        width=768,
        layers=12,
        heads=12,
        attn_mask=True,
    ),
    tokenizer=dict(
        type='AutoTokenizer',
        name_or_path='openai/clip-vit-large-patch14',
        use_fast=False),
    vocab_size=49408,
    transformer_width=768,
    proj_dim=768,
    text_prototype='imagenet',
    text_prompt='openai_imagenet_sub',  # openai_imagenet, openai_imagenet_sub
    context_length=77,
)
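
Assuming these behave like any other mmpretrain test-only config, evaluation can be launched through the standard MMEngine runner. A hedged sketch is below; the checkpoint path and work_dir are placeholders, and the config path reflects the pre-rename clip_zs/ location used in this revision.

# Hedged sketch: running zero-shot evaluation from one of these configs via
# MMEngine; assumes a converted CLIP checkpoint is available locally.
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile('configs/clip_zs/clip-vit-base-patch16_cifar100.py')
cfg.load_from = 'checkpoints/clip-vit-base-patch16.pth'  # placeholder checkpoint path
cfg.work_dir = 'work_dirs/clip_zeroshot_cifar100'

runner = Runner.from_cfg(cfg)
metrics = runner.test()  # reports top-1/top-5 accuracy from the Accuracy evaluator
print(metrics)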