How to handle Multi Label DataSet from Directory for image captioning in PyTorch












1














I need a help in PyTorch,
Regarding Dataloader, and dataset
Can someone aid/guide me



Here is my query :
I am trying for Image Captioning using https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/03-advanced/image_captioning.



Here they have used Standard COCO Dataset.



I have dataset as images/ and captions/ directory .



Example



Directory Structure:



images/T001.jpg 
images/T002.jpg
...
...
captions/T001.txt
captions/T002.txt
....
....


The above is the relation. Caption file has 'n' number of captions in each separate line.



I am able to create a custom Dataset class, in that the complete caption file content is being returned. But I want only one line alone gas to be returned.



Any guidance/suggestion on how to achieving this.



++++++++++++++++++++++++++++++++++++++++++++++++
Here is the class that i have designed:



from __future__ import print_function
import torch
from torchvision import datasets, models, transforms
from torchvision import transforms
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence
import torch.optim as optim
import torch.nn as nn
#from torch import np
import numpy as np
import utils_c
from data_loader_c import get_cust_data_loader
from models import CNN, RNN
from vocab_custom import Vocabulary, load_vocab
import os

class ImageCaptionDataSet(data.Dataset):
def __init__(self, path, json, vocab=None, transform=None):
self.vocab = vocab
self.transform = transform
self.img_dir_path = path
self.cap_dir_path = json
self.all_imgs_path = glob.glob(os.path.join(self.img_dir_path,'*.jpg'))
self.all_caps_path = glob.glob(os.path.join(self.cap_dir_path,'*.txt'))
pass

def __getitem__(self,index):
vocab = self.vocab

img_path = self.all_imgs_path[index]
img_base_name = os.path.basename(img_path)
cap_base_name = img_base_name.replace(".jpg",".txt")
cap_path = os.path.join(self.cap_dir_path,cap_base_name)

caption_all_for_a_image = open(cap_path).read().split("n")

image = Image.open(img_path)
image = image.convert('RGB')

if self.transform != None:
# apply image preprocessing
image = self.transform(image)

#captions_combined =
#max_len = 0
#for caption in caption_all_for_a_image:
# caption_str = str(caption).lower()
# tokens = nltk.tokenize.word_tokenize(caption_str)
# m = len(tokens) + 2
# if m>max_len:
# max_len = m
# caption = torch.Tensor([vocab(vocab.start_token())] +
# [vocab(token) for token in tokens] +
# [vocab(vocab.end_token())])
# captions_combined.append(caption)
# #yield image, caption
#return image,torch.Tensor(captions_combined)

caption_str = str(caption_all_for_a_image).lower()
tokens = nltk.tokenize.word_tokenize(caption_str)
caption = torch.Tensor([vocab(vocab.start_token())] +
[vocab(token) for token in tokens] +
[vocab(vocab.end_token())])

return image,caption

def __len__(self):
return len(self.all_imgs_path)


+++++++++++++++++++++++++++++++++










share|improve this question






















  • which of the lines do you want? the first? last? a random one?
    – Shai
    Nov 23 '18 at 9:33










  • Assume Image001 has 5 captions ie. 5 lines of text. I want 5 times the return has to be executed. ie. Image001 - line 1 Image002 - line 2 like that.
    – rajeshkumargp
    Nov 24 '18 at 5:00


















1














I need a help in PyTorch,
Regarding Dataloader, and dataset
Can someone aid/guide me



Here is my query :
I am trying for Image Captioning using https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/03-advanced/image_captioning.



Here they have used Standard COCO Dataset.



I have dataset as images/ and captions/ directory .



Example



Directory Structure:



images/T001.jpg 
images/T002.jpg
...
...
captions/T001.txt
captions/T002.txt
....
....


The above is the relation. Caption file has 'n' number of captions in each separate line.



I am able to create a custom Dataset class, in that the complete caption file content is being returned. But I want only one line alone gas to be returned.



Any guidance/suggestion on how to achieving this.



++++++++++++++++++++++++++++++++++++++++++++++++
Here is the class that i have designed:



from __future__ import print_function
import torch
from torchvision import datasets, models, transforms
from torchvision import transforms
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence
import torch.optim as optim
import torch.nn as nn
#from torch import np
import numpy as np
import utils_c
from data_loader_c import get_cust_data_loader
from models import CNN, RNN
from vocab_custom import Vocabulary, load_vocab
import os

class ImageCaptionDataSet(data.Dataset):
def __init__(self, path, json, vocab=None, transform=None):
self.vocab = vocab
self.transform = transform
self.img_dir_path = path
self.cap_dir_path = json
self.all_imgs_path = glob.glob(os.path.join(self.img_dir_path,'*.jpg'))
self.all_caps_path = glob.glob(os.path.join(self.cap_dir_path,'*.txt'))
pass

def __getitem__(self,index):
vocab = self.vocab

img_path = self.all_imgs_path[index]
img_base_name = os.path.basename(img_path)
cap_base_name = img_base_name.replace(".jpg",".txt")
cap_path = os.path.join(self.cap_dir_path,cap_base_name)

caption_all_for_a_image = open(cap_path).read().split("n")

image = Image.open(img_path)
image = image.convert('RGB')

if self.transform != None:
# apply image preprocessing
image = self.transform(image)

#captions_combined =
#max_len = 0
#for caption in caption_all_for_a_image:
# caption_str = str(caption).lower()
# tokens = nltk.tokenize.word_tokenize(caption_str)
# m = len(tokens) + 2
# if m>max_len:
# max_len = m
# caption = torch.Tensor([vocab(vocab.start_token())] +
# [vocab(token) for token in tokens] +
# [vocab(vocab.end_token())])
# captions_combined.append(caption)
# #yield image, caption
#return image,torch.Tensor(captions_combined)

caption_str = str(caption_all_for_a_image).lower()
tokens = nltk.tokenize.word_tokenize(caption_str)
caption = torch.Tensor([vocab(vocab.start_token())] +
[vocab(token) for token in tokens] +
[vocab(vocab.end_token())])

return image,caption

def __len__(self):
return len(self.all_imgs_path)


+++++++++++++++++++++++++++++++++










share|improve this question






















  • which of the lines do you want? the first? last? a random one?
    – Shai
    Nov 23 '18 at 9:33










  • Assume Image001 has 5 captions ie. 5 lines of text. I want 5 times the return has to be executed. ie. Image001 - line 1 Image002 - line 2 like that.
    – rajeshkumargp
    Nov 24 '18 at 5:00
















1












1








1







I need a help in PyTorch,
Regarding Dataloader, and dataset
Can someone aid/guide me



Here is my query :
I am trying for Image Captioning using https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/03-advanced/image_captioning.



Here they have used Standard COCO Dataset.



I have dataset as images/ and captions/ directory .



Example



Directory Structure:



images/T001.jpg 
images/T002.jpg
...
...
captions/T001.txt
captions/T002.txt
....
....


The above is the relation. Caption file has 'n' number of captions in each separate line.



I am able to create a custom Dataset class, in that the complete caption file content is being returned. But I want only one line alone gas to be returned.



Any guidance/suggestion on how to achieving this.



++++++++++++++++++++++++++++++++++++++++++++++++
Here is the class that i have designed:



from __future__ import print_function
import torch
from torchvision import datasets, models, transforms
from torchvision import transforms
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence
import torch.optim as optim
import torch.nn as nn
#from torch import np
import numpy as np
import utils_c
from data_loader_c import get_cust_data_loader
from models import CNN, RNN
from vocab_custom import Vocabulary, load_vocab
import os

class ImageCaptionDataSet(data.Dataset):
def __init__(self, path, json, vocab=None, transform=None):
self.vocab = vocab
self.transform = transform
self.img_dir_path = path
self.cap_dir_path = json
self.all_imgs_path = glob.glob(os.path.join(self.img_dir_path,'*.jpg'))
self.all_caps_path = glob.glob(os.path.join(self.cap_dir_path,'*.txt'))
pass

def __getitem__(self,index):
vocab = self.vocab

img_path = self.all_imgs_path[index]
img_base_name = os.path.basename(img_path)
cap_base_name = img_base_name.replace(".jpg",".txt")
cap_path = os.path.join(self.cap_dir_path,cap_base_name)

caption_all_for_a_image = open(cap_path).read().split("n")

image = Image.open(img_path)
image = image.convert('RGB')

if self.transform != None:
# apply image preprocessing
image = self.transform(image)

#captions_combined =
#max_len = 0
#for caption in caption_all_for_a_image:
# caption_str = str(caption).lower()
# tokens = nltk.tokenize.word_tokenize(caption_str)
# m = len(tokens) + 2
# if m>max_len:
# max_len = m
# caption = torch.Tensor([vocab(vocab.start_token())] +
# [vocab(token) for token in tokens] +
# [vocab(vocab.end_token())])
# captions_combined.append(caption)
# #yield image, caption
#return image,torch.Tensor(captions_combined)

caption_str = str(caption_all_for_a_image).lower()
tokens = nltk.tokenize.word_tokenize(caption_str)
caption = torch.Tensor([vocab(vocab.start_token())] +
[vocab(token) for token in tokens] +
[vocab(vocab.end_token())])

return image,caption

def __len__(self):
return len(self.all_imgs_path)


+++++++++++++++++++++++++++++++++










share|improve this question













I need a help in PyTorch,
Regarding Dataloader, and dataset
Can someone aid/guide me



Here is my query :
I am trying for Image Captioning using https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/03-advanced/image_captioning.



Here they have used Standard COCO Dataset.



I have dataset as images/ and captions/ directory .



Example



Directory Structure:



images/T001.jpg 
images/T002.jpg
...
...
captions/T001.txt
captions/T002.txt
....
....


The above is the relation. Caption file has 'n' number of captions in each separate line.



I am able to create a custom Dataset class, in that the complete caption file content is being returned. But I want only one line alone gas to be returned.



Any guidance/suggestion on how to achieving this.



++++++++++++++++++++++++++++++++++++++++++++++++
Here is the class that i have designed:



from __future__ import print_function
import torch
from torchvision import datasets, models, transforms
from torchvision import transforms
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence
import torch.optim as optim
import torch.nn as nn
#from torch import np
import numpy as np
import utils_c
from data_loader_c import get_cust_data_loader
from models import CNN, RNN
from vocab_custom import Vocabulary, load_vocab
import os

class ImageCaptionDataSet(data.Dataset):
def __init__(self, path, json, vocab=None, transform=None):
self.vocab = vocab
self.transform = transform
self.img_dir_path = path
self.cap_dir_path = json
self.all_imgs_path = glob.glob(os.path.join(self.img_dir_path,'*.jpg'))
self.all_caps_path = glob.glob(os.path.join(self.cap_dir_path,'*.txt'))
pass

def __getitem__(self,index):
vocab = self.vocab

img_path = self.all_imgs_path[index]
img_base_name = os.path.basename(img_path)
cap_base_name = img_base_name.replace(".jpg",".txt")
cap_path = os.path.join(self.cap_dir_path,cap_base_name)

caption_all_for_a_image = open(cap_path).read().split("n")

image = Image.open(img_path)
image = image.convert('RGB')

if self.transform != None:
# apply image preprocessing
image = self.transform(image)

#captions_combined =
#max_len = 0
#for caption in caption_all_for_a_image:
# caption_str = str(caption).lower()
# tokens = nltk.tokenize.word_tokenize(caption_str)
# m = len(tokens) + 2
# if m>max_len:
# max_len = m
# caption = torch.Tensor([vocab(vocab.start_token())] +
# [vocab(token) for token in tokens] +
# [vocab(vocab.end_token())])
# captions_combined.append(caption)
# #yield image, caption
#return image,torch.Tensor(captions_combined)

caption_str = str(caption_all_for_a_image).lower()
tokens = nltk.tokenize.word_tokenize(caption_str)
caption = torch.Tensor([vocab(vocab.start_token())] +
[vocab(token) for token in tokens] +
[vocab(vocab.end_token())])

return image,caption

def __len__(self):
return len(self.all_imgs_path)


+++++++++++++++++++++++++++++++++







python pytorch






share|improve this question













share|improve this question











share|improve this question




share|improve this question










asked Nov 23 '18 at 7:43









rajeshkumargprajeshkumargp

1616




1616












  • which of the lines do you want? the first? last? a random one?
    – Shai
    Nov 23 '18 at 9:33










  • Assume Image001 has 5 captions ie. 5 lines of text. I want 5 times the return has to be executed. ie. Image001 - line 1 Image002 - line 2 like that.
    – rajeshkumargp
    Nov 24 '18 at 5:00




















  • which of the lines do you want? the first? last? a random one?
    – Shai
    Nov 23 '18 at 9:33










  • Assume Image001 has 5 captions ie. 5 lines of text. I want 5 times the return has to be executed. ie. Image001 - line 1 Image002 - line 2 like that.
    – rajeshkumargp
    Nov 24 '18 at 5:00


















which of the lines do you want? the first? last? a random one?
– Shai
Nov 23 '18 at 9:33




which of the lines do you want? the first? last? a random one?
– Shai
Nov 23 '18 at 9:33












Assume Image001 has 5 captions ie. 5 lines of text. I want 5 times the return has to be executed. ie. Image001 - line 1 Image002 - line 2 like that.
– rajeshkumargp
Nov 24 '18 at 5:00






Assume Image001 has 5 captions ie. 5 lines of text. I want 5 times the return has to be executed. ie. Image001 - line 1 Image002 - line 2 like that.
– rajeshkumargp
Nov 24 '18 at 5:00














1 Answer
1






active

oldest

votes


















1














First, using str() to convert the list of captions into a single string (caption_str = str(caption_all_for_a_image)) is a bad idea:



cap = ['a sentence', 'bla bla bla']
str(cap)


Returns this sting:




"['a sentence', 'bla bla bla']"



Note that [', and ', ' are part of the resulting string!



You can pick one of the captions at random:



import random
...
cap_idx = random.randi(0, len(caption_all_for_a_image)-1) # pick one at random
caption_str = caption_all_for_a_image[cap_idx].lower() # actual selection





share|improve this answer





















    Your Answer






    StackExchange.ifUsing("editor", function () {
    StackExchange.using("externalEditor", function () {
    StackExchange.using("snippets", function () {
    StackExchange.snippets.init();
    });
    });
    }, "code-snippets");

    StackExchange.ready(function() {
    var channelOptions = {
    tags: "".split(" "),
    id: "1"
    };
    initTagRenderer("".split(" "), "".split(" "), channelOptions);

    StackExchange.using("externalEditor", function() {
    // Have to fire editor after snippets, if snippets enabled
    if (StackExchange.settings.snippets.snippetsEnabled) {
    StackExchange.using("snippets", function() {
    createEditor();
    });
    }
    else {
    createEditor();
    }
    });

    function createEditor() {
    StackExchange.prepareEditor({
    heartbeatType: 'answer',
    autoActivateHeartbeat: false,
    convertImagesToLinks: true,
    noModals: true,
    showLowRepImageUploadWarning: true,
    reputationToPostImages: 10,
    bindNavPrevention: true,
    postfix: "",
    imageUploader: {
    brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
    contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
    allowUrls: true
    },
    onDemand: true,
    discardSelector: ".discard-answer"
    ,immediatelyShowMarkdownHelp:true
    });


    }
    });














    draft saved

    draft discarded


















    StackExchange.ready(
    function () {
    StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53442510%2fhow-to-handle-multi-label-dataset-from-directory-for-image-captioning-in-pytorch%23new-answer', 'question_page');
    }
    );

    Post as a guest















    Required, but never shown

























    1 Answer
    1






    active

    oldest

    votes








    1 Answer
    1






    active

    oldest

    votes









    active

    oldest

    votes






    active

    oldest

    votes









    1














    First, using str() to convert the list of captions into a single string (caption_str = str(caption_all_for_a_image)) is a bad idea:



    cap = ['a sentence', 'bla bla bla']
    str(cap)


    Returns this sting:




    "['a sentence', 'bla bla bla']"



    Note that [', and ', ' are part of the resulting string!



    You can pick one of the captions at random:



    import random
    ...
    cap_idx = random.randi(0, len(caption_all_for_a_image)-1) # pick one at random
    caption_str = caption_all_for_a_image[cap_idx].lower() # actual selection





    share|improve this answer


























      1














      First, using str() to convert the list of captions into a single string (caption_str = str(caption_all_for_a_image)) is a bad idea:



      cap = ['a sentence', 'bla bla bla']
      str(cap)


      Returns this sting:




      "['a sentence', 'bla bla bla']"



      Note that [', and ', ' are part of the resulting string!



      You can pick one of the captions at random:



      import random
      ...
      cap_idx = random.randi(0, len(caption_all_for_a_image)-1) # pick one at random
      caption_str = caption_all_for_a_image[cap_idx].lower() # actual selection





      share|improve this answer
























        1












        1








        1






        First, using str() to convert the list of captions into a single string (caption_str = str(caption_all_for_a_image)) is a bad idea:



        cap = ['a sentence', 'bla bla bla']
        str(cap)


        Returns this sting:




        "['a sentence', 'bla bla bla']"



        Note that [', and ', ' are part of the resulting string!



        You can pick one of the captions at random:



        import random
        ...
        cap_idx = random.randi(0, len(caption_all_for_a_image)-1) # pick one at random
        caption_str = caption_all_for_a_image[cap_idx].lower() # actual selection





        share|improve this answer












        First, using str() to convert the list of captions into a single string (caption_str = str(caption_all_for_a_image)) is a bad idea:



        cap = ['a sentence', 'bla bla bla']
        str(cap)


        Returns this sting:




        "['a sentence', 'bla bla bla']"



        Note that [', and ', ' are part of the resulting string!



        You can pick one of the captions at random:



        import random
        ...
        cap_idx = random.randi(0, len(caption_all_for_a_image)-1) # pick one at random
        caption_str = caption_all_for_a_image[cap_idx].lower() # actual selection






        share|improve this answer












        share|improve this answer



        share|improve this answer










        answered Nov 23 '18 at 9:51









        ShaiShai

        69.3k22135242




        69.3k22135242






























            draft saved

            draft discarded




















































            Thanks for contributing an answer to Stack Overflow!


            • Please be sure to answer the question. Provide details and share your research!

            But avoid



            • Asking for help, clarification, or responding to other answers.

            • Making statements based on opinion; back them up with references or personal experience.


            To learn more, see our tips on writing great answers.





            Some of your past answers have not been well-received, and you're in danger of being blocked from answering.


            Please pay close attention to the following guidance:


            • Please be sure to answer the question. Provide details and share your research!

            But avoid



            • Asking for help, clarification, or responding to other answers.

            • Making statements based on opinion; back them up with references or personal experience.


            To learn more, see our tips on writing great answers.




            draft saved


            draft discarded














            StackExchange.ready(
            function () {
            StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53442510%2fhow-to-handle-multi-label-dataset-from-directory-for-image-captioning-in-pytorch%23new-answer', 'question_page');
            }
            );

            Post as a guest















            Required, but never shown





















































            Required, but never shown














            Required, but never shown












            Required, but never shown







            Required, but never shown

































            Required, but never shown














            Required, but never shown












            Required, but never shown







            Required, but never shown







            Popular posts from this blog

            Berounka

            Different font size/position of beamer's navigation symbols template's content depending on regular/plain...

            Sphinx de Gizeh