Joshua Lochner
committed on
Commit
·
1fc070a
1
Parent(s):
9340261
Update pipeline.py
Browse files- pipeline.py +26 -21
pipeline.py
CHANGED
@@ -280,19 +280,28 @@ def binary_search_above(transcript, start_index, end_index, time):
|
|
280 |
|
281 |
class PreTrainedPipeline():
|
282 |
def __init__(self, path: str):
|
283 |
-
path2 = os.path.join(path,
|
284 |
self.model2 = AutoModelForSequenceClassification.from_pretrained(path2)
|
285 |
self.tokenizer2 = AutoTokenizer.from_pretrained(path2)
|
286 |
self.pipeline2 = SponsorBlockClassificationPipeline(
|
287 |
model=self.model2, tokenizer=self.tokenizer2)
|
288 |
|
289 |
def __call__(self, inputs: str)-> List[Dict[str, Any]]:
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
'
|
294 |
-
'
|
295 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
296 |
return self.pipeline2(data)
|
297 |
|
298 |
|
@@ -301,26 +310,22 @@ class SponsorBlockClassificationPipeline(TextClassificationPipeline):
|
|
301 |
def __init__(self, model, tokenizer):
|
302 |
super().__init__(model=model, tokenizer=tokenizer, return_all_scores=True)
|
303 |
|
304 |
-
def preprocess(self,
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
|
306 |
-
|
307 |
-
segment_words = extract_segment(words, video['start'], video['end'])
|
308 |
-
text = ' '.join(x['text'] for x in segment_words)
|
309 |
-
|
310 |
-
model_inputs = self.tokenizer(
|
311 |
text, return_tensors=self.framework, **tokenizer_kwargs)
|
312 |
-
return {'video': video, 'model_inputs': model_inputs}
|
313 |
-
|
314 |
-
def _forward(self, data):
|
315 |
-
model_outputs = self.model(**data['model_inputs'])
|
316 |
-
return {'video': data['video'], 'model_outputs': model_outputs}
|
317 |
|
318 |
-
def postprocess(self, data, function_to_apply=None, return_all_scores=False):
|
319 |
-
model_outputs = data['model_outputs']
|
320 |
|
|
|
321 |
results = super().postprocess(model_outputs, function_to_apply, return_all_scores)
|
322 |
|
323 |
for result in results:
|
324 |
result['label_text'] = CATEGORIES[result['label']]
|
325 |
|
326 |
-
return results
|
|
|
280 |
|
281 |
class PreTrainedPipeline():
    """Entry point expected by the Hugging Face inference runtime.

    Loads a sequence-classification model from ``<path>/model`` and wraps it
    in a SponsorBlockClassificationPipeline.
    """

    def __init__(self, path: str):
        # Model weights/tokenizer live in the 'model' subdirectory of the
        # repository path handed to us by the runtime.
        path2 = os.path.join(path, 'model')
        self.model2 = AutoModelForSequenceClassification.from_pretrained(path2)
        self.tokenizer2 = AutoTokenizer.from_pretrained(path2)
        self.pipeline2 = SponsorBlockClassificationPipeline(
            model=self.model2, tokenizer=self.tokenizer2)

    def __call__(self, inputs: str) -> List[Dict[str, Any]]:
        """Classify either raw text or transcript segments of a video.

        Args:
            inputs: Either free text to classify directly, or a compressed
                automated-call string of the form
                ``"<video_id>,<start1>,<end1>[,<start2>,<end2>...]"``
                (no spaces, at least two commas).

        Returns:
            The pipeline's classification results.

        Raises:
            ValueError: if a compressed string carries an odd number of
                time values (starts and ends must pair up), or a time value
                is not parseable as a float.
        """
        if ' ' not in inputs and inputs.count(',') >= 2:  # Automated call (compressed string)
            video_id, _, raw_times = inputs.partition(',')
            values = raw_times.split(',')
            # Times must come in (start, end) pairs; fail loudly otherwise
            # (the previous numpy reshape raised a far less clear error).
            if len(values) % 2:
                raise ValueError(
                    f'Expected an even number of time values, got {len(values)}')
            data = [
                {'video_id': video_id, 'start': float(start), 'end': float(end)}
                for start, end in zip(values[::2], values[1::2])
            ]
        else:
            data = inputs
        return self.pipeline2(data)
|
306 |
|
307 |
|
|
|
310 |
def __init__(self, model, tokenizer):
    """Build the text-classification pipeline, keeping the score of every
    label rather than only the top one."""
    # NOTE(review): return_all_scores is deprecated in recent transformers
    # releases in favour of top_k=None — confirm against the pinned version.
    super().__init__(model=model, tokenizer=tokenizer, return_all_scores=True)
|
312 |
|
313 |
+
def preprocess(self, data, **tokenizer_kwargs):
    """Turn one input item into model-ready tokenizer tensors.

    Args:
        data: Either a raw text string (classified as-is), or a dict-like
            record with 'video_id', 'start' and 'end' keys describing a
            transcript segment to fetch and classify.
        **tokenizer_kwargs: Extra keyword arguments forwarded to the tokenizer.

    Returns:
        The tokenizer encoding, with tensors in ``self.framework`` format.
    """
    if isinstance(data, str):  # If string, assume this is what user wants to classify
        text = data
    else:  # Otherwise, get data from transcript
        # BUG FIX: the original referenced an undefined name ``video`` in
        # this branch (the parameter is called ``data``), which raised
        # NameError for every non-string input.
        words = get_words(data['video_id'])
        segment_words = extract_segment(words, data['start'], data['end'])
        text = ' '.join(x['text'] for x in segment_words)

    return self.tokenizer(
        text, return_tensors=self.framework, **tokenizer_kwargs)
|
|
|
|
|
|
|
|
|
|
|
323 |
|
|
|
|
|
324 |
|
325 |
+
def postprocess(self, model_outputs, function_to_apply=None, return_all_scores=False):
    """Delegate to the parent's postprocessing, then annotate each result
    in place with a human-readable 'label_text' looked up in CATEGORIES."""
    results = super().postprocess(model_outputs, function_to_apply, return_all_scores)
    for entry in results:
        entry['label_text'] = CATEGORIES[entry['label']]
    return results
|