CodeZzz committed
Commit 90b5e1f · 1 Parent(s): 9ef6b0f
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ ckpts/* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,31 @@
+ ---
+ license: other
+ license_name: cogvlm2
+ license_link: https://huggingface.co/THUDM/cogvlm2-llama3-chat-19B/blob/main/LICENSE
+
+ language:
+ - en
+ pipeline_tag: text-generation
+ tags:
+ - chat
+ - cogvlm2
+
+ inference: false
+ ---
+ # VisionReward-Image
+
+ ## Introduction
+ We present VisionReward, a general strategy for aligning visual generation models, both image and video, with human preferences through a fine-grained and multi-dimensional framework. We decompose human preferences in images and videos into multiple dimensions, each represented by a series of judgment questions whose answers are linearly weighted and summed into an interpretable and accurate score. To address the challenges of video quality assessment, we systematically analyze various dynamic features of videos, which helps VisionReward surpass VideoScore by 17.2% and achieve top performance in video preference prediction.
+ Here, we present the VisionReward-Image model.
+
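The scoring rule described above can be sketched as follows (our notation, not taken from the paper): if $q_i \in \{0, 1\}$ denotes the binary answer to the $i$-th judgment question and $w_i$ its learned weight, then

$$\mathrm{score} = \sum_i w_i \, q_i .$$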
+ ## Merging and Extracting Checkpoint Files
+ Use the following commands to merge the split files into a single `.tar` file and then extract it into the current directory:
+
+ ```sh
+ cat ckpts/split_part_* > ckpts/visionreward_image.tar
+ tar -xvf ckpts/visionreward_image.tar
+ ```
+
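Optionally, you can verify each split file before merging against the SHA-256 digest recorded in its Git LFS pointer in this commit, e.g. for `ckpts/split_part_aa`:

```sh
# Optional integrity check: the expected digest is the LFS oid recorded
# for split_part_aa in this commit.
echo "9a7a1f3f4998763891d5847f15e9356ee388892824b6931fb57aff9edd172f8b  ckpts/split_part_aa" | sha256sum -c -
```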
+ ## Using this model
+ You can quickly install the Python package dependencies and run model inference by following our [GitHub repository](https://github.com/THUDM/VisionReward).
+ > This model uses fp32 precision parameters and requires the sat (SwissArmyTransformer) library for invocation. For the bf16 version of the model, see [https://huggingface.co/THUDM/VisionReward-Image-bf16](https://huggingface.co/THUDM/VisionReward-Image-bf16)
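As a minimal setup sketch (the `requirements.txt` file name is an assumption; see the repository README for the authoritative install and inference steps):

```sh
# Sketch: fetch the inference code and install its dependencies.
git clone https://github.com/THUDM/VisionReward
cd VisionReward
# Assumed dependency file name; check the repository for the actual one.
pip install -r requirements.txt
```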
ckpts/split_part_aa ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a7a1f3f4998763891d5847f15e9356ee388892824b6931fb57aff9edd172f8b
+ size 5221908480
ckpts/split_part_ab ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ad33f733572698294b3a661b8221bcd0a90f3c92c4345876a26e623d7e42a73b
+ size 5221908480
ckpts/split_part_ac ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e1870cfec4bae857dc658f498c5fd54c5fad2ba994912f2be4b5f182b92f5e07
+ size 5221908480
ckpts/split_part_ad ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7bde4f729bf0de1cd7519c9b5ce249aaa3a7ddf128722348ee81d710d00fb79a
+ size 5221908480
ckpts/split_part_ae ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1a9e7ae16eb4558d349c839d3fa4156eb816dc83a0abe5feb9689c6282479b39
+ size 5221908480
ckpts/split_part_af ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:81dae2cb5af154d578039411843092a605c05df250f4181869de39975fb353e3
+ size 5221908480
ckpts/split_part_ag ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d4b3c26ed2cbbf28c56bf007701f7a18f42a0fad89ecaba3c9219ab2dc0bcd63
+ size 5221908480
ckpts/split_part_ah ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:467f56744f37e49d8b861d98e31c51d6e1ff668a9bede7c62396459d73d4bbdc
+ size 2453288960
latest ADDED
@@ -0,0 +1 @@
+ 1
model_config.json ADDED
@@ -0,0 +1,41 @@
+ {
+     "model_class": "VisualChatModel",
+     "tokenizer_type": "Meta-Llama-3-8B-Instruct",
+     "num_layers": 32,
+     "hidden_size": 4096,
+     "num_attention_heads": 32,
+     "vocab_size": 128256,
+     "layernorm_order": "pre",
+     "model_parallel_size": 1,
+     "max_sequence_length": 8192,
+     "use_bias": false,
+     "inner_hidden_size": 14336,
+     "num_multi_query_heads": 8,
+     "image_length": 2304,
+     "image_size": 1344,
+     "eva_args": {
+         "model_class": "EVA2CLIPModel",
+         "num_layers": 63,
+         "hidden_size": 1792,
+         "num_attention_heads": 16,
+         "vocab_size": 1,
+         "layernorm_order": "post",
+         "model_parallel_size": 1,
+         "max_sequence_length": 257,
+         "inner_hidden_size": 15360,
+         "use_final_layernorm": false,
+         "layernorm_epsilon": 1e-06,
+         "row_parallel_linear_final_bias": false,
+         "image_size": [
+             1344,
+             1344
+         ],
+         "pre_len": 1,
+         "post_len": 0,
+         "in_channels": 3,
+         "patch_size": 14
+     },
+     "bos_token_id": 128000,
+     "eos_token_id": 128001,
+     "pad_token_id": null
+ }