Joshua Lochner committed on
Commit
1fc070a
·
1 Parent(s): 9340261

Update pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +26 -21
pipeline.py CHANGED
@@ -280,19 +280,28 @@ def binary_search_above(transcript, start_index, end_index, time):
280
 
281
class PreTrainedPipeline():
    """Inference API entry point.

    Loads the sequence-classification model and tokenizer from the
    'model' subdirectory of *path* and exposes classification through
    `__call__`, which expects a comma-separated string:
    "video_id,start,end".
    """

    def __init__(self, path: str):
        # The checkpoint lives in the 'model' subdirectory of the served repo.
        model_dir = os.path.join(path, "model")
        self.model2 = AutoModelForSequenceClassification.from_pretrained(model_dir)
        self.tokenizer2 = AutoTokenizer.from_pretrained(model_dir)
        self.pipeline2 = SponsorBlockClassificationPipeline(
            model=self.model2, tokenizer=self.tokenizer2)

    def __call__(self, inputs: str) -> List[Dict[str, Any]]:
        """Parse "video_id,start,end" and run the classification pipeline."""
        # Any fields beyond the first three are ignored.
        fields = inputs.split(',')
        data = {
            'video_id': fields[0],
            'start': float(fields[1]),
            'end': float(fields[2]),
        }
        return self.pipeline2(data)
297
 
298
 
@@ -301,26 +310,22 @@ class SponsorBlockClassificationPipeline(TextClassificationPipeline):
301
def __init__(self, model, tokenizer):
    """Initialise the underlying text-classification pipeline.

    `return_all_scores=True` makes the parent pipeline return the score
    for every label instead of only the top one.
    """
    super().__init__(model=model, tokenizer=tokenizer, return_all_scores=True)
303
 
304
def preprocess(self, video, **tokenizer_kwargs):
    """Build tokenizer inputs for one video segment.

    `video` is a dict with 'video_id', 'start' and 'end' keys; the
    transcript text for that span is fetched via the project helpers
    `get_words` / `extract_segment` (defined elsewhere in this module)
    and tokenized. The video dict is carried through so later pipeline
    stages can reference it.
    """
    transcript = get_words(video['video_id'])
    segment = extract_segment(transcript, video['start'], video['end'])
    text = ' '.join(word['text'] for word in segment)

    model_inputs = self.tokenizer(
        text, return_tensors=self.framework, **tokenizer_kwargs)
    return {'video': video, 'model_inputs': model_inputs}
313
-
314
- def _forward(self, data):
315
- model_outputs = self.model(**data['model_inputs'])
316
- return {'video': data['video'], 'model_outputs': model_outputs}
317
 
318
def postprocess(self, data, function_to_apply=None, return_all_scores=False):
    """Post-process model outputs, attaching a human-readable label.

    Delegates scoring to the parent pipeline, then adds a 'label_text'
    entry (looked up in the module-level CATEGORIES mapping) to each
    result dict.
    """
    results = super().postprocess(
        data['model_outputs'], function_to_apply, return_all_scores)
    for entry in results:
        entry['label_text'] = CATEGORIES[entry['label']]
    return results
 
280
 
281
class PreTrainedPipeline():
    """Inference API entry point.

    Loads the sequence-classification model and tokenizer from the
    'model' subdirectory of *path*. `__call__` accepts either free-form
    text to classify directly, or a compressed, space-free string
    "video_id,start1,end1[,start2,end2,...]" describing one or more
    transcript segments.
    """

    def __init__(self, path: str):
        # The checkpoint lives in the 'model' subdirectory of the served repo.
        path2 = os.path.join(path, 'model')
        self.model2 = AutoModelForSequenceClassification.from_pretrained(path2)
        self.tokenizer2 = AutoTokenizer.from_pretrained(path2)
        self.pipeline2 = SponsorBlockClassificationPipeline(
            model=self.model2, tokenizer=self.tokenizer2)

    def __call__(self, inputs: str) -> List[Dict[str, Any]]:
        """Classify `inputs`.

        A space-free string containing at least two commas is treated as
        an automated (compressed) call and expanded into one dict per
        (start, end) segment; anything else is passed through to the
        pipeline as raw text.

        Raises:
            ValueError: if a compressed call has an odd number of time
                values or a time is not numeric.
        """
        if ' ' not in inputs and inputs.count(',') >= 2:  # Automated call (compressed string)
            video_id, _, times_csv = inputs.partition(',')
            times = times_csv.split(',')
            # FIX: pair the times directly instead of round-tripping the
            # strings through np.array/np.reshape; an odd count now fails
            # with a clear message (still a ValueError, as the old
            # reshape raised) rather than an opaque reshape error.
            if len(times) % 2:
                raise ValueError(
                    f'Expected start/end pairs, got {len(times)} time values')
            data = [
                {'video_id': video_id, 'start': float(start), 'end': float(end)}
                for start, end in zip(times[::2], times[1::2])
            ]
        else:
            data = inputs
        return self.pipeline2(data)
306
 
307
 
 
310
def __init__(self, model, tokenizer):
    """Initialise the underlying text-classification pipeline.

    `return_all_scores=True` makes the parent pipeline return the score
    for every label instead of only the top one.
    """
    super().__init__(model=model, tokenizer=tokenizer, return_all_scores=True)
312
 
313
def preprocess(self, data, **tokenizer_kwargs):
    """Tokenize the input for classification.

    Args:
        data: Either a raw text string to classify directly, or a dict
            with 'video_id', 'start' and 'end' keys identifying a
            transcript segment to fetch (via the project helpers
            `get_words` / `extract_segment`) and classify.
        **tokenizer_kwargs: Extra keyword arguments forwarded to the
            tokenizer.

    Returns:
        The tokenizer's encoded inputs as framework tensors.
    """
    if isinstance(data, str):  # If string, assume this is what the user wants to classify
        text = data
    else:  # Otherwise, get the text from the video transcript
        # BUG FIX: the original referenced the undefined name `video`
        # here (NameError for any dict input); the parameter is `data`.
        words = get_words(data['video_id'])
        segment_words = extract_segment(words, data['start'], data['end'])
        text = ' '.join(x['text'] for x in segment_words)

    return self.tokenizer(
        text, return_tensors=self.framework, **tokenizer_kwargs)
 
 
 
 
 
323
 
 
 
324
 
325
def postprocess(self, model_outputs, function_to_apply=None, return_all_scores=False):
    """Post-process model outputs, attaching a human-readable label.

    Delegates scoring to the parent pipeline, then adds a 'label_text'
    entry (looked up in the module-level CATEGORIES mapping) to each
    result dict.
    """
    results = super().postprocess(model_outputs, function_to_apply, return_all_scores)
    for entry in results:
        entry['label_text'] = CATEGORIES[entry['label']]
    return results