---
tags:
- image-to-text
- image-captioning
language:
- ru
metrics:
- bleu
library_name: transformers
---

# vit-rugpt2-image-captioning

This is an image captioning model trained on a translated (en-ru) version of the COCO2014 dataset.
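
As the name suggests, it is a `VisionEncoderDecoderModel` pairing a ViT image encoder with a ruGPT-2 text decoder. As a rough illustration only (not the confirmed training setup of this model), such a pair can be assembled from pretrained checkpoints as sketched below; the base checkpoint names are assumptions.

```python
# Rough sketch (assumption, not the confirmed training setup) of assembling
# a ViT encoder + ruGPT-2 decoder into a single VisionEncoderDecoderModel.
from transformers import VisionEncoderDecoderModel, AutoTokenizer

encoder_ckpt = "google/vit-base-patch16-224-in21k"      # assumed vision encoder checkpoint
decoder_ckpt = "sberbank-ai/rugpt3small_based_on_gpt2"  # assumed Russian GPT-2 decoder checkpoint

model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(encoder_ckpt, decoder_ckpt)
tokenizer = AutoTokenizer.from_pretrained(decoder_ckpt)

# The decoder needs explicit start/pad token ids before fine-tuning or generation
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id or tokenizer.eos_token_id
```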

# Metrics on test data

* BLEU: 8.672
* BLEU precision 1: 30.567
* BLEU precision 2: 7.895
* BLEU precision 3: 3.261
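
The evaluation script itself is not part of this card; the snippet below is only a minimal sketch of how corpus-level BLEU and per-n-gram precisions like those above can be computed with the Hugging Face `evaluate` library. The captions in it are illustrative placeholders, not test-set data.

```python
# Minimal sketch of computing BLEU with the `evaluate` library.
# Predictions/references below are illustrative placeholders only.
import evaluate

bleu = evaluate.load("bleu")

predictions = ["Самолет на взлетно-посадочной полосе аэропорта."]  # model-generated captions
references = [["Самолет стоит на взлетно-посадочной полосе."]]     # reference captions (one list per prediction)

result = bleu.compute(predictions=predictions, references=references)
print(result["bleu"])        # corpus-level BLEU
print(result["precisions"])  # per-n-gram precisions (1-4)
```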

# Sample running code

```python
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
import torch
from PIL import Image

# Load the trained encoder-decoder model with its feature extractor and tokenizer
model = VisionEncoderDecoderModel.from_pretrained("vit-rugpt2-image-captioning")
feature_extractor = ViTFeatureExtractor.from_pretrained("vit-rugpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("vit-rugpt2-image-captioning")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}

def predict_caption(image_paths):
    # Open each image and make sure it is RGB before feature extraction
    images = []
    for image_path in image_paths:
        i_image = Image.open(image_path)
        if i_image.mode != "RGB":
            i_image = i_image.convert(mode="RGB")

        images.append(i_image)

    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    # Generate caption token ids with beam search and decode them to text
    output_ids = model.generate(pixel_values, **gen_kwargs)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return preds

predict_caption(['train2014/COCO_train2014_000000295442.jpg'])  # ['Самолет на взлетно-посадочной полосе аэропорта.'] ("A plane on an airport runway.")
```

# Sample running code using transformers pipeline

```python
from transformers import pipeline

image_to_text = pipeline("image-to-text", model="vit-rugpt2-image-captioning")

image_to_text("train2014/COCO_train2014_000000296754.jpg")  # [{'generated_text': 'Человек идет по улице с зонтом.'}] ("A person walks down the street with an umbrella.")
```

# Contact for any help

* https://huggingface.co/tuman
* https://github.com/tumanov-a
* https://t.me/tumanov_av