Add new SentenceTransformer model.
Browse files- README.md +197 -140
- config.json +1 -1
- config_sentence_transformers.json +1 -1
- model.safetensors +1 -1
README.md
CHANGED
@@ -7,117 +7,180 @@ tags:
|
|
7 |
- sentence-similarity
|
8 |
- feature-extraction
|
9 |
- generated_from_trainer
|
10 |
-
- dataset_size:
|
11 |
- loss:MultipleNegativesRankingLoss
|
12 |
widget:
|
13 |
-
- source_sentence:
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
|
|
|
|
|
|
|
|
26 |
|
|
|
27 |
|
28 |
-
|
29 |
-
each number to 1 significant figure.
|
30 |
|
31 |
|
32 |
-
|
33 |
|
34 |
-
0.5841 x 36.3
|
35 |
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
|
39 |
-
|
|
|
|
|
40 |
|
41 |
|
42 |
Options:
|
43 |
|
44 |
-
A.
|
|
|
|
|
|
|
|
|
45 |
|
46 |
-
|
47 |
|
48 |
-
C. 0.6
|
49 |
|
50 |
-
|
51 |
|
52 |
|
53 |
-
Answer:
|
|
|
|
|
|
|
|
|
54 |
sentences:
|
55 |
-
-
|
56 |
-
|
57 |
-
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
sentences:
|
67 |
-
-
|
68 |
-
-
|
69 |
-
- Does not
|
70 |
-
|
|
|
|
|
|
|
71 |
|
|
|
72 |
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
|
76 |
Options:
|
77 |
|
78 |
-
A.
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
-
B. 2,986,030,000
|
81 |
|
82 |
-
|
83 |
|
84 |
-
D. 308,603
|
85 |
|
|
|
86 |
|
87 |
-
|
|
|
|
|
88 |
sentences:
|
89 |
-
-
|
90 |
-
-
|
91 |
-
- When
|
92 |
-
|
93 |
-
- source_sentence: 'Construct:
|
|
|
94 |
|
95 |
|
96 |
-
Question:
|
97 |
-
represents his hits and misses?
|
98 |
|
99 |
|
100 |
Options:
|
101 |
|
102 |
-
A.
|
103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
|
105 |
-
|
106 |
-
with 1/3
|
107 |
|
108 |
-
C. Pie chart showing hits, in white, with 320 degrees of the chart and misses,
|
109 |
-
highlighted in red, with 40 degrees.
|
110 |
|
111 |
-
|
112 |
-
and misses, in white, with just under 2/3
|
113 |
|
114 |
|
115 |
-
|
116 |
-
red, with 1/3'
|
117 |
sentences:
|
118 |
-
-
|
119 |
-
-
|
120 |
-
-
|
121 |
---
|
122 |
|
123 |
# SentenceTransformer based on Alibaba-NLP/gte-base-en-v1.5
|
@@ -170,9 +233,9 @@ from sentence_transformers import SentenceTransformer
|
|
170 |
model = SentenceTransformer("Gurveer05/gte-base-eedi-2024")
|
171 |
# Run inference
|
172 |
sentences = [
|
173 |
-
'Construct:
|
174 |
-
'
|
175 |
-
'
|
176 |
]
|
177 |
embeddings = model.encode(sentences)
|
178 |
print(embeddings.shape)
|
@@ -227,19 +290,19 @@ You can finetune this model on your own dataset.
|
|
227 |
#### csv
|
228 |
|
229 |
* Dataset: csv
|
230 |
-
* Size:
|
231 |
-
* Columns: <code>qa_pair_text</code
|
232 |
* Approximate statistics based on the first 1000 samples:
|
233 |
-
| | qa_pair_text
|
234 |
-
|
235 |
-
| type | string
|
236 |
-
| details | <ul><li>min:
|
237 |
* Samples:
|
238 |
-
| qa_pair_text
|
239 |
-
|
240 |
-
| <code>Construct:
|
241 |
-
| <code>Construct:
|
242 |
-
| <code>Construct:
|
243 |
* Loss: [<code>MultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss) with these parameters:
|
244 |
```json
|
245 |
{
|
@@ -253,19 +316,19 @@ You can finetune this model on your own dataset.
|
|
253 |
#### csv
|
254 |
|
255 |
* Dataset: csv
|
256 |
-
* Size:
|
257 |
-
* Columns: <code>qa_pair_text</code
|
258 |
* Approximate statistics based on the first 1000 samples:
|
259 |
-
| | qa_pair_text
|
260 |
-
|
261 |
-
| type | string
|
262 |
-
| details | <ul><li>min:
|
263 |
* Samples:
|
264 |
-
| qa_pair_text
|
265 |
-
|
266 |
-
| <code>Construct:
|
267 |
-
| <code>Construct:
|
268 |
-
| <code>Construct:
|
269 |
* Loss: [<code>MultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss) with these parameters:
|
270 |
```json
|
271 |
{
|
@@ -278,15 +341,18 @@ You can finetune this model on your own dataset.
|
|
278 |
#### Non-Default Hyperparameters
|
279 |
|
280 |
- `eval_strategy`: steps
|
281 |
-
- `
|
282 |
-
- `
|
|
|
283 |
- `weight_decay`: 0.01
|
284 |
- `num_train_epochs`: 20
|
285 |
-
- `lr_scheduler_type`:
|
286 |
- `lr_scheduler_kwargs`: {'num_cycles': 10}
|
287 |
- `warmup_ratio`: 0.1
|
288 |
- `fp16`: True
|
289 |
- `load_best_model_at_end`: True
|
|
|
|
|
290 |
- `batch_sampler`: no_duplicates
|
291 |
|
292 |
#### All Hyperparameters
|
@@ -296,14 +362,14 @@ You can finetune this model on your own dataset.
|
|
296 |
- `do_predict`: False
|
297 |
- `eval_strategy`: steps
|
298 |
- `prediction_loss_only`: True
|
299 |
-
- `per_device_train_batch_size`:
|
300 |
-
- `per_device_eval_batch_size`:
|
301 |
- `per_gpu_train_batch_size`: None
|
302 |
- `per_gpu_eval_batch_size`: None
|
303 |
-
- `gradient_accumulation_steps`:
|
304 |
- `eval_accumulation_steps`: None
|
305 |
- `torch_empty_cache_steps`: None
|
306 |
-
- `learning_rate`: 5e-
|
307 |
- `weight_decay`: 0.01
|
308 |
- `adam_beta1`: 0.9
|
309 |
- `adam_beta2`: 0.999
|
@@ -311,7 +377,7 @@ You can finetune this model on your own dataset.
|
|
311 |
- `max_grad_norm`: 1.0
|
312 |
- `num_train_epochs`: 20
|
313 |
- `max_steps`: -1
|
314 |
-
- `lr_scheduler_type`:
|
315 |
- `lr_scheduler_kwargs`: {'num_cycles': 10}
|
316 |
- `warmup_ratio`: 0.1
|
317 |
- `warmup_steps`: 0
|
@@ -376,8 +442,8 @@ You can finetune this model on your own dataset.
|
|
376 |
- `hub_strategy`: every_save
|
377 |
- `hub_private_repo`: False
|
378 |
- `hub_always_push`: False
|
379 |
-
- `gradient_checkpointing`:
|
380 |
-
- `gradient_checkpointing_kwargs`:
|
381 |
- `include_inputs_for_metrics`: False
|
382 |
- `eval_do_concat_batches`: True
|
383 |
- `fp16_backend`: auto
|
@@ -407,51 +473,42 @@ You can finetune this model on your own dataset.
|
|
407 |
</details>
|
408 |
|
409 |
### Training Logs
|
410 |
-
| Epoch
|
411 |
-
|
412 |
-
| 0.
|
413 |
-
| 1.
|
414 |
-
| 1.
|
415 |
-
| 2.
|
416 |
-
| 2
|
417 |
-
| 3.
|
418 |
-
|
|
419 |
-
| 4.
|
420 |
-
|
|
421 |
-
|
|
422 |
-
|
|
423 |
-
|
|
424 |
-
|
|
425 |
-
|
|
426 |
-
|
|
427 |
-
|
|
428 |
-
|
|
429 |
-
|
|
430 |
-
|
|
431 |
-
|
|
432 |
-
|
|
433 |
-
|
|
434 |
-
|
|
435 |
-
|
|
436 |
-
|
|
437 |
-
|
|
438 |
-
|
|
439 |
-
| 14.0825 | 168 | 0.4 | 0.6934 |
|
440 |
-
| 14.5855 | 174 | 0.3568 | - |
|
441 |
-
| 15.0884 | 180 | 0.3634 | 0.6852 |
|
442 |
-
| 15.5914 | 186 | 0.3412 | - |
|
443 |
-
| 16.0943 | 192 | 0.3374 | 0.6702 |
|
444 |
-
| 16.5972 | 198 | 0.3127 | - |
|
445 |
-
| 17.1002 | 204 | 0.3235 | 0.6611 |
|
446 |
-
| 17.6031 | 210 | 0.2903 | - |
|
447 |
-
| **18.1061** | **216** | **0.2943** | **0.6571** |
|
448 |
|
449 |
* The bold row denotes the saved checkpoint.
|
450 |
|
451 |
### Framework Versions
|
452 |
- Python: 3.10.14
|
453 |
- Sentence Transformers: 3.1.1
|
454 |
-
- Transformers: 4.44.
|
455 |
- PyTorch: 2.4.0
|
456 |
- Accelerate: 0.33.0
|
457 |
- Datasets: 2.19.2
|
|
|
7 |
- sentence-similarity
|
8 |
- feature-extraction
|
9 |
- generated_from_trainer
|
10 |
+
- dataset_size:2442
|
11 |
- loss:MultipleNegativesRankingLoss
|
12 |
widget:
|
13 |
+
- source_sentence: 'Construct: Interpret sloping linear sections of a displacement-time
|
14 |
+
graph.
|
15 |
+
|
16 |
+
|
17 |
+
Question: This graph shows how far Fido the dog is from his home.
|
18 |
+
|
19 |
+
What might the negative-sloping section represent? A graph with time (secs) on
|
20 |
+
the horizontal axis and distance (m) on the vertical axis. The graph starts at
|
21 |
+
the origin, travels in a straight line up and right, travels horizontally, then
|
22 |
+
travels in a straight line down and right back to the x-axis.
|
23 |
+
|
24 |
+
|
25 |
+
Options:
|
26 |
+
|
27 |
+
A. Fido is walking home
|
28 |
+
|
29 |
+
B. Fido has fallen asleep
|
30 |
|
31 |
+
C. Fido is accelerating
|
32 |
|
33 |
+
D. Fido is walking away from home
|
|
|
34 |
|
35 |
|
36 |
+
Correct Answer: Fido is walking home
|
37 |
|
|
|
38 |
|
39 |
+
Incorrect Answer: Fido is walking away from home
|
40 |
+
|
41 |
+
|
42 |
+
Predicted Misconception: Negative slope indicates movement away, not towards a
|
43 |
+
starting point.'
|
44 |
+
sentences:
|
45 |
+
- Does not realise you can use equivalent fractions to break fractions up into smaller
|
46 |
+
divisions
|
47 |
+
- Divides by the order of the root
|
48 |
+
- Believes a downward slope on a distance-time graph means travelling away
|
49 |
+
- source_sentence: 'Construct: Identify reflex angles.
|
50 |
|
51 |
|
52 |
+
Question: An angle measures 192^degree .
|
53 |
+
|
54 |
+
This means it is...
|
55 |
|
56 |
|
57 |
Options:
|
58 |
|
59 |
+
A. Acute
|
60 |
+
|
61 |
+
B. Obtuse
|
62 |
+
|
63 |
+
C. Reflex
|
64 |
|
65 |
+
D. A right angle
|
66 |
|
|
|
67 |
|
68 |
+
Correct Answer: Reflex
|
69 |
|
70 |
|
71 |
+
Incorrect Answer: Obtuse
|
72 |
+
|
73 |
+
|
74 |
+
Predicted Misconception: Believing an angle greater than 180 degrees but less
|
75 |
+
than 360 degrees is obtuse.'
|
76 |
sentences:
|
77 |
+
- Multiplies rather than divides
|
78 |
+
- Confuses factors and multiples
|
79 |
+
- Does not understand that an obtuse angle is between 90 and 180 degrees
|
80 |
+
- source_sentence: 'Construct: Solve quadratic equations using factorisation in the
|
81 |
+
form (x + a)(x + b).
|
82 |
+
|
83 |
+
|
84 |
+
Question: In which region would x^2-10 x-25=0 belong? A Venn diagram made up
|
85 |
+
of two overlapping circles. One is labelled ''Factorises'' and the other is labelled
|
86 |
+
''Has one solution equal to 0''.
|
87 |
+
|
88 |
+
A is in the ''Factorises'' circle only, B is in the overlap of the two circles,
|
89 |
+
C is in the ''Has one solution equal to 0'' circle only, and D is outside the
|
90 |
+
circles.
|
91 |
+
|
92 |
+
|
93 |
+
Options:
|
94 |
+
|
95 |
+
A. A
|
96 |
+
|
97 |
+
B. B
|
98 |
+
|
99 |
+
C. C
|
100 |
+
|
101 |
+
D. D
|
102 |
+
|
103 |
+
|
104 |
+
Correct Answer: D
|
105 |
+
|
106 |
+
|
107 |
+
Incorrect Answer: C
|
108 |
+
|
109 |
+
|
110 |
+
Predicted Misconception: Equating factorization with having a solution equal to
|
111 |
+
zero.'
|
112 |
sentences:
|
113 |
+
- Believes all quadratic equations have a solution of 0
|
114 |
+
- Believes order of operations does not affect the answer to a calculation
|
115 |
+
- Does not realise that the sum of the two shorter sides must be greater than the
|
116 |
+
third side for it to be a possible triangle
|
117 |
+
- source_sentence: 'Construct: Factorise a quadratic expression in the form ax² +
|
118 |
+
bx + c where a is prime.
|
119 |
+
|
120 |
|
121 |
+
Question: Step 1: Factorise the following expression
|
122 |
|
123 |
+
|
124 |
+
(
|
125 |
+
|
126 |
+
3 x^2+5 x+2
|
127 |
+
|
128 |
+
).
|
129 |
|
130 |
|
131 |
Options:
|
132 |
|
133 |
+
A. (3 x+2)(3 x+1)
|
134 |
+
|
135 |
+
B. (3 x+2)(x+1)
|
136 |
+
|
137 |
+
C. Cannot be factorised
|
138 |
+
|
139 |
+
D. (3 x+1)(x+2)
|
140 |
|
|
|
141 |
|
142 |
+
Correct Answer: (3 x+2)(x+1)
|
143 |
|
|
|
144 |
|
145 |
+
Incorrect Answer: (3 x+2)(3 x+1)
|
146 |
|
147 |
+
|
148 |
+
Predicted Misconception: Belief that all quadratic expressions with prime coefficients
|
149 |
+
can only be factorized using prime numbers in both factors.'
|
150 |
sentences:
|
151 |
+
- Does not divide by 2 when calculating the area of a trapezium
|
152 |
+
- Mixes up squaring and multiplying by 2 or doubling
|
153 |
+
- When factorising a quadratic with a non-unit coefficient of x squared, believes
|
154 |
+
that coefficient will be in front of both x terms in the factorised form
|
155 |
+
- source_sentence: 'Construct: Substitute negative integer values into expressions
|
156 |
+
involving no powers or roots.
|
157 |
|
158 |
|
159 |
+
Question: If d=-2 what is the value of 10-2 d ?
|
|
|
160 |
|
161 |
|
162 |
Options:
|
163 |
|
164 |
+
A. -12
|
165 |
+
|
166 |
+
B. 6
|
167 |
+
|
168 |
+
C. 14
|
169 |
+
|
170 |
+
D. 32
|
171 |
+
|
172 |
|
173 |
+
Correct Answer: 14
|
|
|
174 |
|
|
|
|
|
175 |
|
176 |
+
Incorrect Answer: 6
|
|
|
177 |
|
178 |
|
179 |
+
Predicted Misconception: Incorrectly subtracting instead of multiplying the variable.'
|
|
|
180 |
sentences:
|
181 |
+
- Believes multiplying two negatives gives a negative answer
|
182 |
+
- Includes the x variable when giving the equation of a horizontal line
|
183 |
+
- Believes multiplying two negatives gives a negative answer
|
184 |
---
|
185 |
|
186 |
# SentenceTransformer based on Alibaba-NLP/gte-base-en-v1.5
|
|
|
233 |
model = SentenceTransformer("Gurveer05/gte-base-eedi-2024")
|
234 |
# Run inference
|
235 |
sentences = [
|
236 |
+
'Construct: Substitute negative integer values into expressions involving no powers or roots.\n\nQuestion: If d=-2 what is the value of 10-2 d ?\n\nOptions:\nA. -12\nB. 6\nC. 14\nD. 32\n\nCorrect Answer: 14\n\nIncorrect Answer: 6\n\nPredicted Misconception: Incorrectly subtracting instead of multiplying the variable.',
|
237 |
+
'Believes multiplying two negatives gives a negative answer',
|
238 |
+
'Believes multiplying two negatives gives a negative answer',
|
239 |
]
|
240 |
embeddings = model.encode(sentences)
|
241 |
print(embeddings.shape)
|
|
|
290 |
#### csv
|
291 |
|
292 |
* Dataset: csv
|
293 |
+
* Size: 2,442 training samples
|
294 |
+
* Columns: <code>qa_pair_text</code> and <code>MisconceptionName</code>
|
295 |
* Approximate statistics based on the first 1000 samples:
|
296 |
+
| | qa_pair_text | MisconceptionName |
|
297 |
+
|:--------|:-------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------|
|
298 |
+
| type | string | string |
|
299 |
+
| details | <ul><li>min: 57 tokens</li><li>mean: 121.87 tokens</li><li>max: 621 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 15.09 tokens</li><li>max: 40 tokens</li></ul> |
|
300 |
* Samples:
|
301 |
+
| qa_pair_text | MisconceptionName |
|
302 |
+
|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------|
|
303 |
+
| <code>Construct: Solve two-step linear equations, with the variable on one side, with all positive integers.<br><br>Question: Tom and Katie are discussing how to solve:<br>(<br>(8 x / 5)=40<br>)<br><br>Tom says a correct next line of working could be: 8 x = 8 <br>Katie says a correct next line of working could be: (x / 5)=5 <br>Who is correct?<br><br>Options:<br>A. Only<br>Tom<br>B. Only Katie<br>C. Both Tom and Katie<br>D. Neither is correct<br><br>Correct Answer: Only Katie<br><br>Incorrect Answer: Neither is correct<br><br>Predicted Misconception: Multiplying both sides by 5 instead of dividing by 8 to isolate x.</code> | <code>When dividing a fraction by an integer, divides both the numerator and denominator by the integer</code> |
|
304 |
+
| <code>Construct: Rearrange a quadratic equation so that it is in the correct form to be factorised.<br><br>Question: What would be the most useful first step if we wanted to solve the following quadratic equation? x^2+7 x = 8.<br><br>Options:<br>A. Divide by x<br>B. Square root both sides of the equation<br>C. Subtract 7 x from both sides of the equation<br>D. Subtract 8 from both sides of the equation<br><br>Correct Answer: Subtract 8 from both sides of the equation<br><br>Incorrect Answer: Subtract 7 x from both sides of the equation<br><br>Predicted Misconception: Subtracting terms incorrectly from both sides to isolate the constant term.</code> | <code>Does not realise a quadratic must be in the form ax^2+bx+c=0 to be factorised</code> |
|
305 |
+
| <code>Construct: Multiply proper fractions in the form: Fraction × Integer.<br><br>Question: (1 / 2) x 3=.<br><br>Options:<br>A. (3 / 2)<br>B. 3 (1 / 2)<br>C. (3 / 6)<br>D. (1 / 6)<br><br>Correct Answer: (3 / 2)<br><br>Incorrect Answer: (3 / 6)<br><br>Predicted Misconception: Multiplying a fraction by an integer results in a larger fraction, not a smaller one.</code> | <code>When multiplying fractions, multiplies both the numerator and denominator</code> |
|
306 |
* Loss: [<code>MultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss) with these parameters:
|
307 |
```json
|
308 |
{
|
|
|
316 |
#### csv
|
317 |
|
318 |
* Dataset: csv
|
319 |
+
* Size: 1,928 evaluation samples
|
320 |
+
* Columns: <code>qa_pair_text</code> and <code>MisconceptionName</code>
|
321 |
* Approximate statistics based on the first 1000 samples:
|
322 |
+
| | qa_pair_text | MisconceptionName |
|
323 |
+
|:--------|:--------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------|
|
324 |
+
| type | string | string |
|
325 |
+
| details | <ul><li>min: 52 tokens</li><li>mean: 125.81 tokens</li><li>max: 1093 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 14.45 tokens</li><li>max: 39 tokens</li></ul> |
|
326 |
* Samples:
|
327 |
+
| qa_pair_text | MisconceptionName |
|
328 |
+
|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------|
|
329 |
+
| <code>Construct: Calculate the square root of a number.<br><br>Question: What is the square root of nine?<br><br>Options:<br>A. 3<br>B. 4.5<br>C. 81<br>D. 18<br><br>Correct Answer: 3<br><br>Incorrect Answer: 4.5<br><br>Predicted Misconception: Believing square roots result in non-integer values when the original number is a perfect square.</code> | <code>Halves when asked to find the square root</code> |
|
330 |
+
| <code>Construct: Read values off a real life graph.<br><br>Question: A linear graph showing that 10 miles = £8. The graph can be used to work out how much Kay's company pays her for travel.<br><br>Kay's company paid her £ 80 <br><br>How many miles did she travel?<br><br>Options:<br>A. 96<br>B. 100<br>C. 64<br>D. 80<br><br>Correct Answer: 100<br><br>Incorrect Answer: 80<br><br>Predicted Misconception: Assuming a direct correlation without calculating the proportional relationship between miles and cost.</code> | <code>Believes direct proportion means a 1:1 ratio</code> |
|
331 |
+
| <code>Construct: Calculate compound area involving just rectangles and squares, where the dimensions are given in the same units.<br><br>Question: What is the area of this compound shape made with rectangles? Compound shape made of two rectangles with the sides labelled 15 cm, 12 cm, 7 cm and 7 cm. Two sides are unlabelled.<br><br>Options:<br>A. 124 cm^2<br>B. 154 cm^2<br>C. 180 cm^2<br>D. 189 cm^2<br><br>Correct Answer: 124 cm^2<br><br>Incorrect Answer: 154 cm^2<br><br>Predicted Misconception: Incorrectly calculating the area of one rectangle and adding it to the other without considering the overlapping section.</code> | <code>Makes an assumption about line segments being equal within a shape</code> |
|
332 |
* Loss: [<code>MultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss) with these parameters:
|
333 |
```json
|
334 |
{
|
|
|
341 |
#### Non-Default Hyperparameters
|
342 |
|
343 |
- `eval_strategy`: steps
|
344 |
+
- `per_device_train_batch_size`: 64
|
345 |
+
- `per_device_eval_batch_size`: 64
|
346 |
+
- `gradient_accumulation_steps`: 8
|
347 |
- `weight_decay`: 0.01
|
348 |
- `num_train_epochs`: 20
|
349 |
+
- `lr_scheduler_type`: cosine
|
350 |
- `lr_scheduler_kwargs`: {'num_cycles': 10}
|
351 |
- `warmup_ratio`: 0.1
|
352 |
- `fp16`: True
|
353 |
- `load_best_model_at_end`: True
|
354 |
+
- `gradient_checkpointing`: True
|
355 |
+
- `gradient_checkpointing_kwargs`: {'use_reentrant': False}
|
356 |
- `batch_sampler`: no_duplicates
|
357 |
|
358 |
#### All Hyperparameters
|
|
|
362 |
- `do_predict`: False
|
363 |
- `eval_strategy`: steps
|
364 |
- `prediction_loss_only`: True
|
365 |
+
- `per_device_train_batch_size`: 64
|
366 |
+
- `per_device_eval_batch_size`: 64
|
367 |
- `per_gpu_train_batch_size`: None
|
368 |
- `per_gpu_eval_batch_size`: None
|
369 |
+
- `gradient_accumulation_steps`: 8
|
370 |
- `eval_accumulation_steps`: None
|
371 |
- `torch_empty_cache_steps`: None
|
372 |
+
- `learning_rate`: 5e-05
|
373 |
- `weight_decay`: 0.01
|
374 |
- `adam_beta1`: 0.9
|
375 |
- `adam_beta2`: 0.999
|
|
|
377 |
- `max_grad_norm`: 1.0
|
378 |
- `num_train_epochs`: 20
|
379 |
- `max_steps`: -1
|
380 |
+
- `lr_scheduler_type`: cosine
|
381 |
- `lr_scheduler_kwargs`: {'num_cycles': 10}
|
382 |
- `warmup_ratio`: 0.1
|
383 |
- `warmup_steps`: 0
|
|
|
442 |
- `hub_strategy`: every_save
|
443 |
- `hub_private_repo`: False
|
444 |
- `hub_always_push`: False
|
445 |
+
- `gradient_checkpointing`: True
|
446 |
+
- `gradient_checkpointing_kwargs`: {'use_reentrant': False}
|
447 |
- `include_inputs_for_metrics`: False
|
448 |
- `eval_do_concat_batches`: True
|
449 |
- `fp16_backend`: auto
|
|
|
473 |
</details>
|
474 |
|
475 |
### Training Logs
|
476 |
+
| Epoch | Step | Training Loss | loss |
|
477 |
+
|:--------:|:------:|:-------------:|:----------:|
|
478 |
+
| 0.8 | 2 | 2.6135 | - |
|
479 |
+
| 1.2 | 3 | - | 1.0560 |
|
480 |
+
| 1.6 | 4 | 2.058 | - |
|
481 |
+
| 2.4 | 6 | 1.7173 | 0.8711 |
|
482 |
+
| 3.2 | 8 | 1.5537 | - |
|
483 |
+
| 3.6 | 9 | - | 0.7901 |
|
484 |
+
| 4.0 | 10 | 1.4489 | - |
|
485 |
+
| 4.8 | 12 | 1.4622 | 0.7437 |
|
486 |
+
| 5.6 | 14 | 1.2437 | - |
|
487 |
+
| 6.0 | 15 | - | 0.7079 |
|
488 |
+
| 6.4 | 16 | 1.1761 | - |
|
489 |
+
| 7.2 | 18 | 1.0282 | 0.6748 |
|
490 |
+
| 8.0 | 20 | 0.9983 | - |
|
491 |
+
| 8.4 | 21 | - | 0.6437 |
|
492 |
+
| 8.8 | 22 | 0.9676 | - |
|
493 |
+
| 9.6 | 24 | 0.8342 | 0.6169 |
|
494 |
+
| 10.4 | 26 | 0.7937 | - |
|
495 |
+
| 10.8 | 27 | - | 0.5950 |
|
496 |
+
| 11.2 | 28 | 0.6869 | - |
|
497 |
+
| 12.0 | 30 | 0.6558 | 0.5807 |
|
498 |
+
| 12.8 | 32 | 0.6286 | - |
|
499 |
+
| 13.2 | 33 | - | 0.5732 |
|
500 |
+
| 13.6 | 34 | 0.5468 | - |
|
501 |
+
| **14.4** | **36** | **0.4923** | **0.5694** |
|
502 |
+
| 15.2 | 38 | 0.4477 | - |
|
503 |
+
| 15.6 | 39 | - | 0.5727 |
|
504 |
+
| 16.0 | 40 | 0.4108 | - |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
505 |
|
506 |
* The bold row denotes the saved checkpoint.
|
507 |
|
508 |
### Framework Versions
|
509 |
- Python: 3.10.14
|
510 |
- Sentence Transformers: 3.1.1
|
511 |
+
- Transformers: 4.44.0
|
512 |
- PyTorch: 2.4.0
|
513 |
- Accelerate: 0.33.0
|
514 |
- Datasets: 2.19.2
|
config.json
CHANGED
@@ -36,7 +36,7 @@
|
|
36 |
},
|
37 |
"rope_theta": 500000,
|
38 |
"torch_dtype": "float32",
|
39 |
-
"transformers_version": "4.44.
|
40 |
"type_vocab_size": 0,
|
41 |
"unpad_inputs": false,
|
42 |
"use_memory_efficient_attention": false,
|
|
|
36 |
},
|
37 |
"rope_theta": 500000,
|
38 |
"torch_dtype": "float32",
|
39 |
+
"transformers_version": "4.44.0",
|
40 |
"type_vocab_size": 0,
|
41 |
"unpad_inputs": false,
|
42 |
"use_memory_efficient_attention": false,
|
config_sentence_transformers.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
{
|
2 |
"__version__": {
|
3 |
"sentence_transformers": "3.1.1",
|
4 |
-
"transformers": "4.44.
|
5 |
"pytorch": "2.4.0"
|
6 |
},
|
7 |
"prompts": {},
|
|
|
1 |
{
|
2 |
"__version__": {
|
3 |
"sentence_transformers": "3.1.1",
|
4 |
+
"transformers": "4.44.0",
|
5 |
"pytorch": "2.4.0"
|
6 |
},
|
7 |
"prompts": {},
|
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 547119128
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:665965611c704f9f1f96ba10bbd209db8453d86adc1022de8aa4c91eb0e2eaca
|
3 |
size 547119128
|