# Source code for xmodaler.datasets.images.vcr

# Copyright 2021 JD.com, Inc., JD AI
"""
@author: Yehao Li, Jianjie Luo
@contact: yehaoli.sysu@gmail.com, jianjieluo.sysu@gmail.com
"""
import os
import copy
import pickle
import random
import json
import json_lines
import numpy as np
from xmodaler.config import configurable
from xmodaler.config import kfg
from xmodaler.functional import read_np, dict_as_tensor, boxes_to_locfeats, read_np_bbox
from xmodaler.tokenization import BertTokenizer
from ..build import DATASETS_REGISTRY

__all__ = ["VCRDataset"]

@DATASETS_REGISTRY.register()
class VCRDataset:
    @configurable
    def __init__(
        self,
        stage: str,
        task_name: str,  # VCR_Q-A or VCR_QA-R
        anno_folder: str,
        feats_folder: str,
        max_feat_num: int,
        max_seq_len: int,
        seq_per_img: int,
        use_global_v: bool,
        tokenizer
    ):
        self.stage = stage
        self.task_name = task_name
        self.anno_folder = anno_folder
        self.feats_folder = feats_folder
        self.gt_feat_folder = feats_folder + "_gt"
        self.max_feat_num = max_feat_num
        self.seq_per_img = seq_per_img
        self.use_global_v = use_global_v
        self.tokenizer = tokenizer

        if self.task_name == 'VCR_Q-A':
            self.max_seq_len = 38
        else:
            self.max_seq_len = 66

        self.names = [
            'Casey', 'Riley', 'Jessie', 'Jackie', 
            'Avery', 'Jaime', 'Peyton', 'Kerry', 
            'Jody', 'Kendall', 'Frankie', 'Pat', 'Quinn']

    @classmethod
    def from_config(cls, cfg, stage: str = "train;VCR_Q-A"):
        """Translate a config object into keyword arguments for ``__init__``.

        Args:
            cfg: Config object exposing the ``DATALOADER`` and ``MODEL``
                fields read below.
            stage: String of the form ``"<split>;<task_name>"``; it must
                contain exactly one ``';'``.

        Returns:
            dict: Keyword arguments matching the ``__init__`` signature.
        """
        split_name, task = stage.split(';')
        loader_cfg = cfg.DATALOADER

        kwargs = dict(
            stage=split_name,
            task_name=task,
            anno_folder=loader_cfg.ANNO_FOLDER,
            feats_folder=loader_cfg.FEATS_FOLDER,
            max_feat_num=loader_cfg.MAX_FEAT_NUM,
            max_seq_len=cfg.MODEL.MAX_SEQ_LEN,
            seq_per_img=loader_cfg.SEQ_PER_SAMPLE,
            use_global_v=loader_cfg.USE_GLOBAL_V,
        )
        # The tokenizer is built last, after all plain config reads.
        pretraining_cfg = cfg.MODEL.PRETRAINING
        kwargs["tokenizer"] = BertTokenizer.from_pretrained(
            pretraining_cfg.MODEL_NAME,
            do_lower_case=pretraining_cfg.DO_LOWER_CASE,
        )
        return kwargs

[docs]    def load_data(self, cfg):
        cache_path = os.path.join(
            self.anno_folder, "cache",
            "%s_%s_%d.pkl" % (self.task_name, self.stage, self.max_seq_len)
        )
        if not os.path.exists(cache_path):
            datalist = self.load_raw_data(cfg)
            self.tokenize(datalist)
            pickle.dump(datalist, open(cache_path, "wb"))
        datalist = pickle.load(open(cache_path, "rb"))
        return datalist

    def tokenize(self, datalist):
        person_name_id = 0
        for entry in datalist:
            objects_replace_name = []
            for o in entry['objects']:
                if o == 'person':
                    objects_replace_name.append(self.names[person_name_id])
                    person_name_id = (person_name_id + 1) % len(self.names)
                else:
                    objects_replace_name.append(o)

            tokens_q = self.retokenize_and_convert_to_ids(entry["question"], objects_replace_name)
            if self.task_name == "VCR_QA-R":
                tokens_q2 = self.retokenize_and_convert_to_ids(entry["question_a"], objects_replace_name)

            tokens_qr_arr = []
            u_tokens_types_arr = []
            for answer in entry["answers"]:
                tokens_r = self.retokenize_and_convert_to_ids(answer, objects_replace_name)

                if self.task_name == "VCR_Q-A":
                    tokens_q_copy = copy.copy(tokens_q)
                    self.truncate_seq_pair(tokens_q_copy, tokens_r, self.max_seq_len - 3)
                else:
                    tokens_q_copy = copy.copy(tokens_q)
                    tokens_q2_copy = copy.copy(tokens_q2)
                    self.truncate_seq_tri(tokens_q_copy, tokens_q2_copy, tokens_r , self.max_seq_len - 3)
                    tokens_q_copy = tokens_q_copy + tokens_q2_copy

                u_tokens_types = [0] * (len(tokens_q_copy) + 2) + [1] * (len(tokens_r) + 1)
                tokens_qr = self.tokenizer.add_special_tokens_sentences_pair(tokens_q_copy, tokens_r)
                assert len(u_tokens_types) == len(tokens_qr)

                u_tokens_types_arr.append(u_tokens_types)
                tokens_qr_arr.append(tokens_qr)
            entry["question"] = tokens_qr_arr
            entry["u_tokens_types"] = u_tokens_types_arr

    def retokenize_and_convert_to_ids(self, _tokens, objects_replace_name):
        parsed_tokens = []
        for mixed_token in _tokens:
            if isinstance(mixed_token, list):
                tokens = [objects_replace_name[o] for o in mixed_token]
                retokenized_tokens = self.tokenizer.tokenize(tokens[0])
                for token, o in zip(tokens[1:], mixed_token[1:]):
                    retokenized_tokens.append('and')
                    
                    re_tokens = self.tokenizer.tokenize(token)
                    retokenized_tokens.extend(re_tokens)
                parsed_tokens.extend(retokenized_tokens)
            else:
                retokenized_tokens = self.tokenizer.tokenize(mixed_token)
                parsed_tokens.extend(retokenized_tokens)

        ids = self.tokenizer.convert_tokens_to_ids(parsed_tokens)
        return ids

    def load_raw_data_Q2A(self, cfg):
        """Read ``<anno_folder>/<stage>.jsonl`` for the question-to-answer task.

        One record is produced per annotation. The numeric ids are the second
        ``'-'``-separated field of ``img_id`` / ``annot_id``. For the
        ``"test"`` stage the target is a placeholder ``0``; otherwise it is
        the annotation's ``answer_label``.

        Args:
            cfg: Unused.

        Returns:
            list[dict]: Records with keys ``question``, ``img_fn``,
            ``objects``, ``answers``, ``metadata_fn``, ``target``,
            ``image_id`` and ``anno_id``.
        """
        anno_path = os.path.join(self.anno_folder, self.stage + '.jsonl')
        is_test = self.stage == "test"

        records = []
        with open(anno_path, "rb") as f:
            for anno in json_lines.reader(f):
                question = anno["question"]
                image_id = int(anno["img_id"].split("-")[1])
                anno_id = int(anno["annot_id"].split("-")[1])
                target = 0 if is_test else anno["answer_label"]

                record = {
                    "question": question,
                    "img_fn": anno["img_fn"],
                    "objects":  anno["objects"],
                    "answers": anno["answer_choices"],
                    "metadata_fn": anno["metadata_fn"],
                    "target": target,
                    "image_id": image_id,
                    "anno_id": anno_id,
                }
                records.append(record)
        return records

    def load_raw_data_QA2R(self, cfg):
        """Read ``<anno_folder>/<stage>.jsonl`` for the rationale task.

        For the ``"test"`` stage no gold answer is read. Each annotation is
        expanded into one record per answer choice, with the question
        extended by ``"[SEP]"`` and that answer, and a placeholder target of
        ``0``. These records carry no ``"question_a"`` key.

        For other stages one record is produced per annotation. The gold
        answer, prefixed with ``"[SEP]"``, is stored under ``"question_a"``
        and the target is ``rationale_label``.

        Args:
            cfg: Unused.

        Returns:
            list[dict]: Records whose ``"answers"`` are the rationale choices.
        """
        def numeric_suffix(tag):
            # Ids look like "<prefix>-<number>"; keep the number.
            return int(tag.split("-")[1])

        anno_path = os.path.join(self.anno_folder, self.stage + '.jsonl')
        records = []
        with open(anno_path, "rb") as f:
            for anno in json_lines.reader(f):
                if self.stage != "test":
                    question = anno["question"]
                    target = anno["rationale_label"]
                    image_id = numeric_suffix(anno["img_id"])
                    anno_id = numeric_suffix(anno["annot_id"])
                    records.append({
                        "question": question,
                        "question_a": ["[SEP]"] + anno["answer_choices"][anno["answer_label"]],
                        "img_fn": anno["img_fn"],
                        "objects":  anno["objects"],
                        "answers": anno["rationale_choices"],
                        "metadata_fn": anno["metadata_fn"],
                        "target": target,
                        "image_id": image_id,
                        "anno_id": anno_id,
                    })
                    continue

                # Test stage: one record per candidate answer.
                for candidate in anno["answer_choices"]:
                    question = anno["question"] + ["[SEP]"] + candidate
                    image_id = numeric_suffix(anno["img_id"])
                    anno_id = numeric_suffix(anno["annot_id"])
                    records.append({
                        "question": question,
                        "img_fn": anno["img_fn"],
                        "objects":  anno["objects"],
                        "answers": anno["rationale_choices"],
                        "metadata_fn": anno["metadata_fn"],
                        "target": 0,
                        "image_id": image_id,
                        "anno_id": anno_id,
                    })
        return records

    def load_raw_data(self, cfg):
        if self.task_name == "VCR_Q-A":
            return self.load_raw_data_Q2A(cfg)
        elif self.task_name == "VCR_QA-R":
            return self.load_raw_data_QA2R(cfg)
        else:
            raise ValueError(f"task_name should be VCR_Q-A or VCR_QA-R")

    def truncate_seq_pair(self, tokens_q, tokens_a, max_length):
        while len(tokens_a) + len(tokens_q) > max_length:
            if len(tokens_a) > len(tokens_q):
                tokens_a.pop()
            else:
                tokens_q.pop()

    def truncate_seq_tri(self, tokens_q, tokens_a, tokens_r, max_length):
        while len(tokens_q) + len(tokens_a) + len(tokens_r) > max_length:
            if len(tokens_r) > (len(tokens_q) + len(tokens_a)):
                tokens_r.pop()
            elif len(tokens_q) > 1:
                tokens_q.pop()
            else:
                tokens_a.pop()          

[docs]    def __call__(self, dataset_dict):
        dataset_dict = copy.deepcopy(dataset_dict)
        img_query = dataset_dict["metadata_fn"][:-5]

        prob = random.random()
        if prob > 0.5 and self.stage == 'train':
            image_path = os.path.join(self.feats_folder + "_mirror", img_query + ".npz")
            gt_image_path = os.path.join(self.gt_feat_folder + "_mirror", img_query + ".npz")
        else:
            image_path = os.path.join(self.feats_folder, img_query + ".npz")
            gt_image_path = os.path.join(self.gt_feat_folder, img_query + ".npz")

        features, image_locations = read_np_bbox(image_path, self.max_feat_num, use_global_v=self.use_global_v)
        gt_features, gt_image_locations = read_np_bbox(gt_image_path, self.max_feat_num, use_global_v=self.use_global_v)

        num_boxes = features.shape[0]
        gt_num_boxes = gt_features.shape[0]
        # NOTE: if you use global v feat, then you need to merge
        if self.use_global_v:
            # merge two features.
            features[0] = (features[0] * num_boxes + gt_features[0] * gt_num_boxes) / (
                num_boxes + gt_num_boxes
            )

            # merge two boxes, and assign the labels.
            gt_boxes = gt_image_locations[1:gt_num_boxes]
            gt_features = gt_features[1:gt_num_boxes]
            gt_num_boxes = gt_num_boxes - 1
            gt_box_preserve = min(self.max_feat_num - 1, gt_num_boxes)
        else:
            gt_boxes = gt_image_locations
            gt_box_preserve = min(self.max_feat_num, gt_num_boxes)

        gt_boxes = gt_boxes[:gt_box_preserve]
        gt_features = gt_features[:gt_box_preserve]
        gt_num_boxes = gt_box_preserve

        num_box_preserve = min(self.max_feat_num - gt_num_boxes, num_boxes)
        boxes = image_locations[:num_box_preserve]
        features = features[:num_box_preserve]

        # concatenate the boxes
        mix_boxes = np.concatenate((boxes, gt_boxes), axis=0)
        mix_features = np.concatenate((features, gt_features), axis=0)

        questions = [np.array(question).astype(np.int64) for question in dataset_dict["question"]]
        u_tokens_types = [np.array(u_tokens_type).astype(np.int64) for u_tokens_type in dataset_dict["u_tokens_types"]]

        ret = {
            kfg.IDS: str(dataset_dict["anno_id"]),
            kfg.SEQ_PER_SAMPLE: self.seq_per_img,
            kfg.ATT_FEATS: mix_features.astype('float32'),
            kfg.ATT_FEATS_LOC: mix_boxes.astype('float32'),
            kfg.U_TOKENS_IDS: questions,
            kfg.U_TOKENS_TYPE: u_tokens_types,
            kfg.U_TARGET_IDS: np.array([dataset_dict["target"]])
        }
        dict_as_tensor(ret)
        return ret