# Source code for xmodaler.datasets.images.mscoco_bert

# Copyright 2021 JD.com, Inc., JD AI
"""
@author: Jianjie Luo
@contact: jianjieluo.sysu@gmail.com
"""
import os
import copy
import json
import pickle
from tqdm import tqdm
import random
import numpy as np

from xmodaler.config import configurable
from xmodaler.config import kfg
from xmodaler.tokenization import BertTokenizer
from xmodaler.functional import (
    read_lines_set, 
    read_np, 
    boxes_to_locfeats, 
    dict_as_tensor
)

from ..build import DATASETS_REGISTRY

__all__ = ["MSCoCoBertDataset"]

def assert_eq(real, expected):
    assert real == expected, "%s (real) vs %s (expected)" % (real, expected)

def encode_sequences_bert(entries, tokenizer, cls_token_id, sep_token_id, pad_token_id, max_length):
    for entry in tqdm(entries, desc='BertTokenize Coco Seqs'):
        entry['tokens_ids'] = []
        entry['target_ids'] = []

        for sent in entry['caption']:
            tokens = tokenizer.encode(sent)
            input_seq = [cls_token_id] + tokens
            target_seq = tokens + [sep_token_id]

            input_seq = input_seq[: max_length]
            target_seq = target_seq[: max_length]

            if len(input_seq) < max_length:
                padding = [pad_token_id] * (max_length - len(input_seq))
                tpadding = [-1] * (max_length - len(input_seq))
                input_seq = input_seq + padding
                target_seq = target_seq + tpadding

            assert_eq(len(input_seq), max_length)
            assert_eq(len(target_seq), max_length)

            entry['tokens_ids'].append(input_seq)
            entry['target_ids'].append(target_seq)

        entry.pop('caption')
        entry['tokens_ids'] = np.array(entry['tokens_ids'], dtype='uint32')
        entry['target_ids'] = np.array(entry['target_ids'], dtype='int32')

    return entries
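
# A minimal sketch of what encode_sequences_bert produces for one caption,
# assuming the standard bert-base-uncased vocabulary ([CLS]=101, [SEP]=102,
# [PAD]=0) and illustrative token ids; nothing below is executed:
#
#   sent = "a dog runs"
#   tokens = tokenizer.encode(sent)                   # e.g. [1037, 3899, 3216]
#   input_seq  = [101, 1037, 3899, 3216, 0, 0, ...]   # [CLS] + tokens, [PAD]-padded
#   target_seq = [1037, 3899, 3216, 102, -1, -1, ...] # tokens + [SEP], -1-padded
#
# The target sequence is the input shifted left by one position, so each input
# token predicts the next token; the -1 padding flags padded target positions
# (so they can be masked out of the loss), while the input uses the real [PAD] id.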


@DATASETS_REGISTRY.register()
class MSCoCoBertDataset:
    @configurable
    def __init__(
        self,
        stage: str,
        anno_file: str,
        seq_per_img: int,
        max_seq_length: int,
        max_feat_num: int,
        feats_folder: str,
        images_ids_file: str,
        tokenizer
    ):
        self.stage = stage
        self.anno_file = anno_file
        self.seq_per_img = seq_per_img
        self.max_seq_length = max_seq_length
        self.max_feat_num = max_feat_num
        self.feats_folder = feats_folder
        self.images_ids_file = images_ids_file
        self.tokenizer = tokenizer

    @classmethod
    def from_config(cls, cfg, stage: str = "train"):
        ann_files = {
            "train": os.path.join(cfg.DATALOADER.ANNO_FOLDER, "dataset_coco.json"),
            "val": os.path.join(cfg.DATALOADER.ANNO_FOLDER, "dataset_coco.json"),
            "test": os.path.join(cfg.DATALOADER.ANNO_FOLDER, "dataset_coco.json"),
        }
        images_ids_files = {
            "train": os.path.join(cfg.DATALOADER.ANNO_FOLDER, "coco_train_image_id.txt"),
            "val": os.path.join(cfg.DATALOADER.ANNO_FOLDER, "coco_val_image_id.txt"),
            "test": os.path.join(cfg.DATALOADER.ANNO_FOLDER, "coco_test_image_id.txt")
        }
        ret = {
            "stage": stage,
            "anno_file": ann_files[stage],
            "max_seq_length": cfg.MODEL.MAX_SEQ_LEN,
            "seq_per_img": cfg.DATALOADER.SEQ_PER_SAMPLE,
            "max_feat_num": cfg.DATALOADER.MAX_FEAT_NUM,
            "feats_folder": cfg.DATALOADER.FEATS_FOLDER,
            "images_ids_file": images_ids_files[stage],
            "tokenizer": BertTokenizer.from_pretrained(
                cfg.MODEL.PRETRAINING.MODEL_NAME,
                do_lower_case=cfg.MODEL.PRETRAINING.DO_LOWER_CASE
            ),
        }
        return ret

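    # Note: all three stages read the same dataset_coco.json annotation file
    # (the Karpathy-split format); the per-stage *_image_id.txt lists are what
    # actually select the train/val/test images in load_data below.
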
    def load_data(self, cfg):
        # Load mscoco data like pretraining
        cache_files = {
            "train": os.path.join(cfg.DATALOADER.ANNO_FOLDER, "mscoco_bert_caption_anno_train.pkl"),
            "val": os.path.join(cfg.DATALOADER.ANNO_FOLDER, "mscoco_bert_caption_anno_val.pkl"),
            "test": os.path.join(cfg.DATALOADER.ANNO_FOLDER, "mscoco_bert_caption_anno_test.pkl")
        }

        if not os.path.exists(cache_files[self.stage]):
            images_ids_set = read_lines_set(self.images_ids_file)

            entries = []
            annotation = json.load(open(self.anno_file, 'r'))['images']
            for img in annotation:
                image_id = str(img['cocoid'])
                if image_id not in images_ids_set:
                    continue
                sentences = []
                for sent in img['sentences']:
                    sentences.append(sent['raw'].lower().strip().strip('.'))
                entries.append({"caption": sentences, "image_id": image_id})

            # pre-tokenize mscoco caption
            cls_token_id = self.tokenizer.vocab["[CLS]"]
            sep_token_id = self.tokenizer.vocab["[SEP]"]
            pad_token_id = self.tokenizer.vocab["[PAD]"]
            datalist = encode_sequences_bert(
                entries, self.tokenizer,
                cls_token_id, sep_token_id, pad_token_id,
                self.max_seq_length
            )
            pickle.dump(datalist, open(cache_files[self.stage], "wb"))
        else:
            datalist = pickle.load(open(cache_files[self.stage], "rb"))

        return datalist

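    # Note: the first load_data call tokenizes every caption and writes the
    # stage-specific .pkl cache above; later runs read the cache directly, so
    # the cache file must be removed to re-tokenize (e.g. after changing
    # MODEL.MAX_SEQ_LEN or the tokenizer).
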
    def __call__(self, dataset_dict):
        dataset_dict = copy.deepcopy(dataset_dict)
        image_id = dataset_dict['image_id']

        # load image feature; keep at most max_feat_num - 1 regions, reserving
        # one slot for the global image feature prepended below
        image_path = os.path.join(self.feats_folder, image_id + ".npz")
        content = read_np(image_path)
        features = content['features'][0:self.max_feat_num - 1]
        boxes = content['boxes'][0:self.max_feat_num - 1]
        image_h = content['image_h'][0]
        image_w = content['image_w'][0]
        num_boxes = len(boxes)
        image_locations = boxes_to_locfeats(boxes, image_w, image_h)

        # add g_feat: mean-pool the region features and prepend them, together
        # with a whole-image location entry ([0, 0, 1, 1, 1])
        imgfeats = features
        g_image_feat = np.sum(imgfeats, axis=0) / num_boxes
        g_image_location = np.array([0, 0, 1, 1, 1])
        num_boxes = num_boxes + 1
        imgfeats = np.concatenate([np.expand_dims(g_image_feat, axis=0), imgfeats], axis=0)
        image_locations = np.concatenate([np.expand_dims(g_image_location, axis=0), image_locations], axis=0)

        # build visual output
        ret = {
            kfg.IDS: image_id,
            kfg.ATT_FEATS: imgfeats.astype('float32'),
            kfg.ATT_FEATS_LOC: image_locations.astype('float32')
        }

        # if val/test, only token-type ids are needed on the text side
        if self.stage != 'train':
            g_tokens_type = np.ones((self.max_seq_length,), dtype=np.int64)
            ret.update({ kfg.G_TOKENS_TYPE: g_tokens_type })
            dict_as_tensor(ret)
            return ret

        # load caption data: sample seq_per_img captions per image, drawing
        # with replacement when the image has fewer than seq_per_img captions
        sent_num = len(dataset_dict['tokens_ids'])
        if sent_num >= self.seq_per_img:
            selects = random.sample(range(sent_num), self.seq_per_img)
        else:
            selects = random.choices(range(sent_num), k=(self.seq_per_img - sent_num))
            selects += list(range(sent_num))

        tokens_ids = [ dataset_dict['tokens_ids'][i, :].astype(np.int64) for i in selects ]
        target_ids = [ dataset_dict['target_ids'][i, :].astype(np.int64) for i in selects ]
        g_tokens_type = [ np.ones((len(dataset_dict['tokens_ids'][i, :]),), dtype=np.int64) for i in selects ]

        ret.update({
            kfg.SEQ_PER_SAMPLE: self.seq_per_img,
            kfg.G_TOKENS_IDS: tokens_ids,
            kfg.G_TARGET_IDS: target_ids,
            kfg.G_TOKENS_TYPE: g_tokens_type,
        })
        dict_as_tensor(ret)
        return ret
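
# A minimal end-to-end usage sketch (hypothetical, with a populated cfg;
# nothing here runs on import). from_config returns a plain kwargs dict, so
# the class can be built directly from it:
#
#   dataset = MSCoCoBertDataset(**MSCoCoBertDataset.from_config(cfg, stage="train"))
#   datalist = dataset.load_data(cfg)
#   sample = dataset(datalist[0])
#   # sample[kfg.ATT_FEATS]    -> (num_boxes + 1, feat_dim) float32 features
#   # sample[kfg.G_TOKENS_IDS] -> seq_per_img caption id sequences (train only)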