running_blip_git.py
# -*- coding: utf-8 -*-
"""VisualQuestionAnswering_PythonCodeTutorial.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1dM89DgL_hg4K3uiKnTQ-p8rtS05wH_fX
"""
!pip install -qU transformers
"""# BLIP
- https://github.com/huggingface/transformers/blob/main/src/transformers/models/blip/modeling_blip.py
- https://huggingface.co/Salesforce/blip-vqa-base/tree/main
"""
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering
import torch
# load the image we will test BLIP on
img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
image
# load necessary components: the processor and the model
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
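# Optional sketch (an addition, not part of the original tutorial): if a CUDA GPU is
# available, BLIP can be moved to it for faster inference. The processor's tensors must
# live on the same device as the model; the model is moved back to the CPU afterwards so
# the rest of the tutorial keeps running exactly as written.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
inputs = processor(image, "how many dogs are in the picture?", return_tensors="pt").to(device)
out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))
model.to("cpu")  # restore the CPU placement used by the cells below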
def get_answer_blip(model, processor, image, question):
    """Answer the given question about the image, handling all preprocessing and postprocessing steps."""
    # preprocess the given image and question
    inputs = processor(image, question, return_tensors="pt")
    # generate the answer (output token IDs)
    out = model.generate(**inputs)
    # post-process the output to get human-friendly English text
    print(processor.decode(out[0], skip_special_tokens=True))
# sample question 1
question = "how many dogs are in the picture?"
get_answer_blip(model, processor, image, question)
# sample question 2
question = "how will you describe the picture?"
get_answer_blip(model, processor, image, question)
# sample question 3
question = "where are they?"
get_answer_blip(model, processor, image, question)
# sample question 4
question = "What are they doing?"
get_answer_blip(model, processor, image, question)
# sample question 5
question = "What the dog is wearing?"
get_answer_blip(model, processor, image, question)
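# A small convenience sketch (an addition, not from the original notebook): the same
# helper can be reused to answer a whole list of questions about the image in one loop.
questions = [
    "how many dogs are in the picture?",
    "what color is the dog?",
    "is the dog sitting or standing?",
]
for q in questions:
    get_answer_blip(model, processor, image, q)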
class BLIP_VQA:
    """Custom implementation of the BLIP VQA pipeline, adapted from the official transformers implementation."""

    def __init__(self, vision_model, text_encoder, text_decoder, processor):
        """Initialize the vision model, text encoder, text decoder, and processor."""
        self.vision_model = vision_model
        self.text_encoder = text_encoder
        self.text_decoder = text_decoder
        self.processor = processor

    def preprocess(self, img, ques):
        """Preprocess the inputs: image and question."""
        # preprocess using the processor
        inputs = self.processor(img, ques, return_tensors='pt')
        # store the pixel values of the image, the input IDs (i.e., token IDs) of the question,
        # and the attention mask separately
        pixel_values = inputs['pixel_values']
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        return pixel_values, input_ids, attention_mask

    def generate_output(self, pixel_values, input_ids, attention_mask):
        """Generate output from the preprocessed inputs."""
        # get the vision outputs (i.e., the image embeddings)
        vision_outputs = self.vision_model(pixel_values=pixel_values)
        img_embeds = vision_outputs[0]
        # create an attention mask with 1s on all the image embedding positions
        img_attention_mask = torch.ones(img_embeds.size()[:-1], dtype=torch.long)
        # encode the question, cross-attending to the image embeddings
        question_outputs = self.text_encoder(input_ids=input_ids,
                                             attention_mask=attention_mask,
                                             encoder_hidden_states=img_embeds,
                                             encoder_attention_mask=img_attention_mask,
                                             return_dict=False)
        # create an attention mask with 1s on all the question embedding positions
        question_embeds = question_outputs[0]
        question_attention_mask = torch.ones(question_embeds.size()[:-1], dtype=torch.long)
        # initialize the answer with the beginning-of-sentence (BOS) token ID used by the BLIP decoder
        bos_ids = torch.full((question_embeds.size(0), 1), fill_value=30522)
        # get the generated token IDs from the decoder
        outputs = self.text_decoder.generate(
            input_ids=bos_ids,
            eos_token_id=102,
            pad_token_id=0,
            encoder_hidden_states=question_embeds,
            encoder_attention_mask=question_attention_mask)
        return outputs

    def postprocess(self, outputs):
        """Post-process the output generated by the text decoder."""
        return self.processor.decode(outputs[0], skip_special_tokens=True)

    def get_answer(self, image, ques):
        """Return a human-friendly answer to a question."""
        # preprocess
        pixel_values, input_ids, attention_mask = self.preprocess(image, ques)
        # generate output
        outputs = self.generate_output(pixel_values, input_ids, attention_mask)
        # post-process
        answer = self.postprocess(outputs)
        return answer
blip_vqa = BLIP_VQA(vision_model=model.vision_model,
text_encoder=model.text_encoder,
text_decoder=model.text_decoder,
processor=processor)
# sample question 1
ques = "how will you describe the picture?"
print(blip_vqa.get_answer(image, ques))
# load another image to test BLIP
img_url = "https://fastly.picsum.photos/id/11/200/200.jpg?hmac=LBGO0uEpEmAVS8NeUXMqxcIdHGIcu0JiOb5DJr4mtUI"
image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
image
# sample question 1
ques = "Describe the picture"
print(blip_vqa.get_answer(image, ques))
# sample question 2
ques = "What is the major color present?"
print(blip_vqa.get_answer(image, ques))
# sample question 3
ques = "How's the weather?"
print(blip_vqa.get_answer(image, ques))
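# Sanity-check sketch (an addition, not from the original notebook): the custom BLIP_VQA
# pipeline above reuses the submodules of the loaded BlipForQuestionAnswering model, so
# its answers should agree with the stock model.generate() path.
ques = "How's the weather?"
inputs = processor(image, ques, return_tensors="pt")
stock_out = model.generate(**inputs)
print("stock  :", processor.decode(stock_out[0], skip_special_tokens=True))
print("custom :", blip_vqa.get_answer(image, ques))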
"""# GIT
- https://github.com/huggingface/transformers/blob/main/src/transformers/models/git/modeling_git.py
- https://huggingface.co/microsoft/git-base-textvqa
"""
!pip install -qU transformers
from transformers import AutoProcessor, AutoModelForCausalLM
from huggingface_hub import hf_hub_download
from PIL import Image
# load the image we will test GIT on
file_path = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset")
image = Image.open(file_path).convert("RGB")
image
# load necessary components: the processor and the model
processor = AutoProcessor.from_pretrained("microsoft/git-base-textvqa")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-textvqa")
class GIT_VQA:
    """Custom implementation of the GIT model for Visual Question Answering (VQA) tasks."""

    def __init__(self, model, processor):
        """Initialize the model and the processor."""
        self.model = model
        self.processor = processor

    def preprocess(self, image, question):
        """Preprocess the inputs: image and question."""
        # process the image to get the pixel values
        pixel_values = self.processor(images=image, return_tensors="pt").pixel_values
        # process the question to get the input IDs, without adding special tokens
        input_ids = self.processor(text=question, add_special_tokens=False).input_ids
        # prepend the CLS token to the input IDs and format them as a batch of size 1
        input_ids = [self.processor.tokenizer.cls_token_id] + input_ids
        input_ids = torch.tensor(input_ids).unsqueeze(0)
        return pixel_values, input_ids

    def generate(self, pixel_values, input_ids):
        """Generate the output from the preprocessed inputs."""
        # generate output using the model with a maximum length of 50 tokens
        outputs = self.model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)
        return outputs

    def postprocess(self, outputs):
        """Post-process the output generated by the model."""
        # decode the output, ignoring special tokens
        answer = self.processor.batch_decode(outputs, skip_special_tokens=True)
        return answer

    def get_answer(self, image, question):
        """Return a human-friendly answer to a question."""
        # preprocess
        pixel_values, input_ids = self.preprocess(image, question)
        # generate output
        outputs = self.generate(pixel_values, input_ids)
        # post-process
        answer = self.postprocess(outputs)
        return answer
# create a GIT instance
git_vqa = GIT_VQA(model=model, processor=processor)
# sample question 1
question = "what does the front of the bus say at the top?"
answer = git_vqa.get_answer(image, question)
print(answer)
# sample question 2
question = "what are all the colors present on the bus?"
answer = git_vqa.get_answer(image, question)
print(answer)
# sample question 3
question = "How many wheels you see in the bus?"
answer = git_vqa.get_answer(image, question)
print(answer)
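# Aside sketch (an addition, not from the original notebook): GIT can also be called
# without a question, in which case generate() produces a caption from the pixel values
# alone. Note that this checkpoint is fine-tuned for TextVQA, so captions may be rougher
# than with a captioning checkpoint such as microsoft/git-base-coco.
pixel_values = processor(images=image, return_tensors="pt").pixel_values
caption_ids = model.generate(pixel_values=pixel_values, max_length=50)
print(processor.batch_decode(caption_ids, skip_special_tokens=True))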
# load another image to test GIT
img_url = "https://fastly.picsum.photos/id/110/500/500.jpg?hmac=wSHhLFNyJ6k3uM94s6etGQ0WWhmwbdUSiZ9ZDL5Hh2Q"
image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
image
# sample question 1
question = "Is it night in the image?"
answer = git_vqa.get_answer(image, question)
print(answer)
running_blip2.py
# %%
!pip install transformers accelerate
# %%
import requests
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch
import os
device = torch.device("cuda", 0)
device
# %%
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)
# %%
model.to(device)
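# %%
# Memory-saving sketch (an addition, not from the original script): on smaller GPUs,
# BLIP-2 can alternatively be loaded with 8-bit weights via bitsandbytes. This assumes
# `pip install bitsandbytes accelerate` and is left commented out here so only one copy
# of the model is loaded.
# model = Blip2ForConditionalGeneration.from_pretrained(
#     "Salesforce/blip2-opt-2.7b", load_in_8bit=True, device_map="auto"
# )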
# %%
import urllib.parse as parse
import os
# a function to determine whether a string is a URL or not
def is_url(string):
    try:
        result = parse.urlparse(string)
        return all([result.scheme, result.netloc, result.path])
    except Exception:
        return False

# a function to load an image from a URL or a local path
def load_image(image_path):
    if is_url(image_path):
        return Image.open(requests.get(image_path, stream=True).raw)
    elif os.path.exists(image_path):
        return Image.open(image_path)
# %%
raw_image = load_image("http://images.cocodataset.org/test-stuff2017/000000007226.jpg")
# %%
question = "a"
inputs = processor(raw_image, question, return_tensors="pt").to(device, dtype=torch.float16)
# %%
out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))
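# %%
# Optional sketch (an addition): generation length can be controlled with standard
# generate() arguments such as max_new_tokens.
out = model.generate(**inputs, max_new_tokens=30)
print(processor.decode(out[0], skip_special_tokens=True))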
# %%
question = "a vintage car driving down a street"
inputs = processor(raw_image, question, return_tensors="pt").to(device, dtype=torch.float16)
# %%
out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))
# %%
question = "Question: What is the estimated year of these cars? Answer:"
inputs = processor(raw_image, question, return_tensors="pt").to(device, dtype=torch.float16)
# %%
out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))
# %%
question = "Question: What is the color of the car? Answer:"
inputs = processor(raw_image, question, return_tensors="pt").to(device, dtype=torch.float16)
# %%
out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))
# %%
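# Convenience sketch (an addition, not from the original script): ask several questions
# in one loop, reusing the "Question: ... Answer:" prompt format shown above.
questions = [
    "How many cars are visible?",
    "Is the photo taken during the day?",
]
for q in questions:
    prompt = f"Question: {q} Answer:"
    inputs = processor(raw_image, prompt, return_tensors="pt").to(device, dtype=torch.float16)
    out = model.generate(**inputs)
    print(q, "->", processor.decode(out[0], skip_special_tokens=True).strip())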