diff --git a/LLAVA_Biovil/llava/eval/__init__.py b/LLAVA_Biovil/llava/eval/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/LLAVA_Biovil/llava/eval/eval_gpt_review.py b/LLAVA_Biovil/llava/eval/eval_gpt_review.py deleted file mode 100644 index 8af4559c65fc2728b11fd2097a109981ee1ef686..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/eval/eval_gpt_review.py +++ /dev/null @@ -1,113 +0,0 @@ -import argparse -import json -import os - -import openai -import tqdm -import ray -import time - -NUM_SECONDS_TO_SLEEP = 3 - -@ray.remote(num_cpus=4) -def get_eval(content: str, max_tokens: int): - while True: - try: - response = openai.ChatCompletion.create( - model='gpt-4', - messages=[{ - 'role': 'system', - 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' - }, { - 'role': 'user', - 'content': content, - }], - temperature=0.2, # TODO: figure out which temperature is best for evaluation - max_tokens=max_tokens, - ) - break - except openai.error.RateLimitError: - pass - except Exception as e: - print(e) - time.sleep(NUM_SECONDS_TO_SLEEP) - - print('success!') - return response['choices'][0]['message']['content'] - - -def parse_score(review): - try: - score_pair = review.split('\n')[0] - score_pair = score_pair.replace(',', ' ') - sp = score_pair.split(' ') - if len(sp) == 2: - return [float(sp[0]), float(sp[1])] - else: - print('error', review) - return [-1, -1] - except Exception as e: - print(e) - print('error', review) - return [-1, -1] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') - parser.add_argument('-q', '--question') - # parser.add_argument('-a', '--answer') - parser.add_argument('-a', '--answer-list', nargs='+', default=[]) - parser.add_argument('-r', '--rule') - parser.add_argument('-o', '--output') - parser.add_argument('--max-tokens', type=int, 
default=1024, help='maximum number of tokens produced in the output') - args = parser.parse_args() - - ray.init() - - f_q = open(os.path.expanduser(args.question)) - f_ans1 = open(os.path.expanduser(args.answer_list[0])) - f_ans2 = open(os.path.expanduser(args.answer_list[1])) - rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) - - review_file = open(f'{args.output}', 'w') - - js_list = [] - handles = [] - idx = 0 - for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): - # if idx == 1: - # break - - ques = json.loads(ques_js) - ans1 = json.loads(ans1_js) - ans2 = json.loads(ans2_js) - - category = json.loads(ques_js)['category'] - if category in rule_dict: - rule = rule_dict[category] - else: - rule = rule_dict['default'] - prompt = rule['prompt'] - role = rule['role'] - content = (f'[Question]\n{ques["text"]}\n\n' - f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' - f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' - f'[System]\n{prompt}\n\n') - js_list.append({ - 'id': idx+1, - 'question_id': ques['question_id'], - 'answer1_id': ans1['answer_id'], - 'answer2_id': ans2['answer_id'], - 'category': category}) - idx += 1 - handles.append(get_eval.remote(content, args.max_tokens)) - # To avoid the rate limit set by OpenAI - time.sleep(NUM_SECONDS_TO_SLEEP) - - reviews = ray.get(handles) - for idx, review in enumerate(reviews): - scores = parse_score(review) - js_list[idx]['content'] = review - js_list[idx]['tuple'] = scores - review_file.write(json.dumps(js_list[idx]) + '\n') - review_file.close() diff --git a/LLAVA_Biovil/llava/eval/eval_gpt_review_bench.py b/LLAVA_Biovil/llava/eval/eval_gpt_review_bench.py deleted file mode 100644 index 06160f2422b5368f30fb967f7cae635208a1dc69..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/eval/eval_gpt_review_bench.py +++ /dev/null @@ -1,121 +0,0 @@ -import argparse -import json -import os - -import openai -import time - -NUM_SECONDS_TO_SLEEP = 0.5 - - -def get_eval(content: str, 
max_tokens: int): - while True: - try: - response = openai.ChatCompletion.create( - model='gpt-4-0314', - messages=[{ - 'role': 'system', - 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' - }, { - 'role': 'user', - 'content': content, - }], - temperature=0.2, # TODO: figure out which temperature is best for evaluation - max_tokens=max_tokens, - ) - break - except openai.error.RateLimitError: - pass - except Exception as e: - print(e) - time.sleep(NUM_SECONDS_TO_SLEEP) - - return response['choices'][0]['message']['content'] - - -def parse_score(review): - try: - score_pair = review.split('\n')[0] - score_pair = score_pair.replace(',', ' ') - sp = score_pair.split(' ') - if len(sp) == 2: - return [float(sp[0]), float(sp[1])] - else: - print('error', review) - return [-1, -1] - except Exception as e: - print(e) - print('error', review) - return [-1, -1] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') - parser.add_argument('-q', '--question') - parser.add_argument('-c', '--context') - parser.add_argument('-a', '--answer-list', nargs='+', default=[]) - parser.add_argument('-r', '--rule') - parser.add_argument('-o', '--output') - parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') - args = parser.parse_args() - - f_q = open(os.path.expanduser(args.question)) - f_ans1 = open(os.path.expanduser(args.answer_list[0])) - f_ans2 = open(os.path.expanduser(args.answer_list[1])) - rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) - - if os.path.isfile(os.path.expanduser(args.output)): - cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] - else: - cur_reviews = [] - - review_file = open(f'{args.output}', 'a') - - context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] - image_to_context = {context['image']: context for context in 
context_list} - - handles = [] - idx = 0 - for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): - ques = json.loads(ques_js) - ans1 = json.loads(ans1_js) - ans2 = json.loads(ans2_js) - - inst = image_to_context[ques['image']] - - if isinstance(inst['caption'], list): - cap_str = '\n'.join(inst['caption']) - else: - cap_str = inst['caption'] - - category = 'llava_bench_' + json.loads(ques_js)['category'] - if category in rule_dict: - rule = rule_dict[category] - else: - assert False, f"Visual QA category not found in rule file: {category}." - prompt = rule['prompt'] - role = rule['role'] - content = (f'[Context]\n{cap_str}\n\n' - f'[Question]\n{ques["text"]}\n\n' - f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' - f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' - f'[System]\n{prompt}\n\n') - cur_js = { - 'id': idx+1, - 'question_id': ques['question_id'], - 'answer1_id': ans1.get('answer_id', ans1['question_id']), - 'answer2_id': ans2.get('answer_id', ans2['answer_id']), - 'category': category - } - if idx >= len(cur_reviews): - review = get_eval(content, args.max_tokens) - scores = parse_score(review) - cur_js['content'] = review - cur_js['tuple'] = scores - review_file.write(json.dumps(cur_js) + '\n') - review_file.flush() - else: - print(f'Skipping {idx} as we already have it.') - idx += 1 - print(idx) - review_file.close() diff --git a/LLAVA_Biovil/llava/eval/eval_gpt_review_visual.py b/LLAVA_Biovil/llava/eval/eval_gpt_review_visual.py deleted file mode 100644 index d6e407a400a67020d801e6c27a3c32a2ee38f30c..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/eval/eval_gpt_review_visual.py +++ /dev/null @@ -1,118 +0,0 @@ -import argparse -import json -import os - -import openai -import time - -NUM_SECONDS_TO_SLEEP = 0.5 - - -def get_eval(content: str, max_tokens: int): - while True: - try: - response = openai.ChatCompletion.create( - model='gpt-4-0314', - messages=[{ - 'role': 'system', - 'content': 'You are a helpful and precise 
assistant for checking the quality of the answer.' - }, { - 'role': 'user', - 'content': content, - }], - temperature=0.2, # TODO: figure out which temperature is best for evaluation - max_tokens=max_tokens, - ) - break - except openai.error.RateLimitError: - pass - except Exception as e: - print(e) - time.sleep(NUM_SECONDS_TO_SLEEP) - - return response['choices'][0]['message']['content'] - - -def parse_score(review): - try: - score_pair = review.split('\n')[0] - score_pair = score_pair.replace(',', ' ') - sp = score_pair.split(' ') - if len(sp) == 2: - return [float(sp[0]), float(sp[1])] - else: - print('error', review) - return [-1, -1] - except Exception as e: - print(e) - print('error', review) - return [-1, -1] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') - parser.add_argument('-q', '--question') - parser.add_argument('-c', '--context') - parser.add_argument('-a', '--answer-list', nargs='+', default=[]) - parser.add_argument('-r', '--rule') - parser.add_argument('-o', '--output') - parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') - args = parser.parse_args() - - f_q = open(os.path.expanduser(args.question)) - f_ans1 = open(os.path.expanduser(args.answer_list[0])) - f_ans2 = open(os.path.expanduser(args.answer_list[1])) - rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) - - if os.path.isfile(os.path.expanduser(args.output)): - cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] - else: - cur_reviews = [] - - review_file = open(f'{args.output}', 'a') - - context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] - image_to_context = {context['image']: context for context in context_list} - - handles = [] - idx = 0 - for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): - ques = json.loads(ques_js) - ans1 = json.loads(ans1_js) - ans2 = json.loads(ans2_js) - 
- inst = image_to_context[ques['image']] - cap_str = '\n'.join(inst['captions']) - box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']]) - - category = json.loads(ques_js)['category'] - if category in rule_dict: - rule = rule_dict[category] - else: - assert False, f"Visual QA category not found in rule file: {category}." - prompt = rule['prompt'] - role = rule['role'] - content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n' - f'[Question]\n{ques["text"]}\n\n' - f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' - f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' - f'[System]\n{prompt}\n\n') - cur_js = { - 'id': idx+1, - 'question_id': ques['question_id'], - 'answer1_id': ans1.get('answer_id', ans1['question_id']), - 'answer2_id': ans2.get('answer_id', ans2['answer_id']), - 'category': category - } - if idx >= len(cur_reviews): - review = get_eval(content, args.max_tokens) - scores = parse_score(review) - cur_js['content'] = review - cur_js['tuple'] = scores - review_file.write(json.dumps(cur_js) + '\n') - review_file.flush() - else: - print(f'Skipping {idx} as we already have it.') - idx += 1 - print(idx) - review_file.close() diff --git a/LLAVA_Biovil/llava/eval/eval_pope.py b/LLAVA_Biovil/llava/eval/eval_pope.py deleted file mode 100644 index b115b8f2327ea9d972f9e41bcbb03c68be6b3508..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/eval/eval_pope.py +++ /dev/null @@ -1,81 +0,0 @@ -import os -import json -import argparse - -def eval_pope(answers, label_file): - label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] - - for answer in answers: - text = answer['text'] - - # Only keep the first sentence - if text.find('.') != -1: - text = text.split('.')[0] - - text = text.replace(',', '') - words = text.split(' ') - if 'No' in words or 'not' in words or 'no' in words: - answer['text'] = 'no' - else: - answer['text'] = 'yes' - - for i in range(len(label_list)): - if label_list[i] == 
'no': - label_list[i] = 0 - else: - label_list[i] = 1 - - pred_list = [] - for answer in answers: - if answer['text'] == 'no': - pred_list.append(0) - else: - pred_list.append(1) - - pos = 1 - neg = 0 - yes_ratio = pred_list.count(1) / len(pred_list) - - TP, TN, FP, FN = 0, 0, 0, 0 - for pred, label in zip(pred_list, label_list): - if pred == pos and label == pos: - TP += 1 - elif pred == pos and label == neg: - FP += 1 - elif pred == neg and label == neg: - TN += 1 - elif pred == neg and label == pos: - FN += 1 - - print('TP\tFP\tTN\tFN\t') - print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) - - precision = float(TP) / float(TP + FP) - recall = float(TP) / float(TP + FN) - f1 = 2*precision*recall / (precision + recall) - acc = (TP + TN) / (TP + TN + FP + FN) - print('Accuracy: {}'.format(acc)) - print('Precision: {}'.format(precision)) - print('Recall: {}'.format(recall)) - print('F1 score: {}'.format(f1)) - print('Yes ratio: {}'.format(yes_ratio)) - print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--annotation-dir", type=str) - parser.add_argument("--question-file", type=str) - parser.add_argument("--result-file", type=str) - args = parser.parse_args() - - questions = [json.loads(line) for line in open(args.question_file)] - questions = {question['question_id']: question for question in questions} - answers = [json.loads(q) for q in open(args.result_file)] - for file in os.listdir(args.annotation_dir): - assert file.startswith('coco_pope_') - assert file.endswith('.json') - category = file[10:-5] - cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] - print('Category: {}, # samples: {}'.format(category, len(cur_answers))) - eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) - print("====================================") diff --git a/LLAVA_Biovil/llava/eval/eval_science_qa.py 
b/LLAVA_Biovil/llava/eval/eval_science_qa.py deleted file mode 100644 index ccf206bbd7a5d6376eef82d61b3ef8bbe0f71c6c..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/eval/eval_science_qa.py +++ /dev/null @@ -1,114 +0,0 @@ -import argparse -import json -import os -import re -import random - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--base-dir', type=str) - parser.add_argument('--result-file', type=str) - parser.add_argument('--output-file', type=str) - parser.add_argument('--output-result', type=str) - parser.add_argument('--split', type=str, default='test') - parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) - return parser.parse_args() - - -def convert_caps(results): - fakecaps = [] - for result in results: - image_id = result['question_id'] - caption = result['text'] - fakecaps.append({"image_id": int(image_id), "caption": caption}) - return fakecaps - - -def get_pred_idx(prediction, choices, options): - """ - Get the index (e.g. 2) from the prediction (e.g. 
'C') - """ - if prediction in options[:len(choices)]: - return options.index(prediction) - else: - return -1 - return random.choice(range(len(choices))) - - -if __name__ == "__main__": - args = get_args() - - base_dir = args.base_dir - split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] - problems = json.load(open(os.path.join(base_dir, "problems.json"))) - predictions = [json.loads(line) for line in open(args.result_file)] - predictions = {pred['question_id']: pred for pred in predictions} - split_problems = {idx: problems[idx] for idx in split_indices} - - results = {'correct': [], 'incorrect': []} - sqa_results = {} - sqa_results['acc'] = None - sqa_results['correct'] = None - sqa_results['count'] = None - sqa_results['results'] = {} - sqa_results['outputs'] = {} - - for prob_id, prob in split_problems.items(): - if prob_id not in predictions: - pred = {'text': 'FAILED', 'prompt': 'Unknown'} - pred_text = 'FAILED' - else: - pred = predictions[prob_id] - pred_text = pred['text'] - - if pred_text in args.options: - answer = pred_text - elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": - answer = pred_text[0] - else: - pattern = re.compile(r'The answer is ([A-Z]).') - res = pattern.findall(pred_text) - if len(res) == 1: - answer = res[0] # 'A', 'B', ... 
- else: - answer = "FAILED" - - pred_idx = get_pred_idx(answer, prob['choices'], args.options) - - analysis = { - 'question_id': prob_id, - 'parsed_ans': answer, - 'ground_truth': args.options[prob['answer']], - 'question': pred['prompt'], - 'pred': pred_text, - 'is_multimodal': '' in pred['prompt'], - } - - sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) - sqa_results['outputs'][prob_id] = pred_text - - if pred_idx == prob['answer']: - results['correct'].append(analysis) - else: - results['incorrect'].append(analysis) - - correct = len(results['correct']) - total = len(results['correct']) + len(results['incorrect']) - - ###### IMG ###### - multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) - multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) - multimodal_total = multimodal_correct + multimodal_incorrect - ###### IMG ###### - - print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') - - sqa_results['acc'] = correct / total * 100 - sqa_results['correct'] = correct - sqa_results['count'] = total - - with open(args.output_file, 'w') as f: - json.dump(results, f, indent=2) - with open(args.output_result, 'w') as f: - json.dump(sqa_results, f, indent=2) diff --git a/LLAVA_Biovil/llava/eval/eval_science_qa_gpt4.py b/LLAVA_Biovil/llava/eval/eval_science_qa_gpt4.py deleted file mode 100644 index c2ff17c915481fb556aba6ec816a9e08f519c515..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/eval/eval_science_qa_gpt4.py +++ /dev/null @@ -1,104 +0,0 @@ -import argparse -import json -import os -import re -import random -from collections import defaultdict - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--base-dir', type=str) - parser.add_argument('--gpt4-result', type=str) - parser.add_argument('--our-result', type=str) - 
parser.add_argument('--split', type=str, default='test') - parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) - return parser.parse_args() - - -def convert_caps(results): - fakecaps = [] - for result in results: - image_id = result['question_id'] - caption = result['text'] - fakecaps.append({"image_id": int(image_id), "caption": caption}) - return fakecaps - - -def get_pred_idx(prediction, choices, options): - """ - Get the index (e.g. 2) from the prediction (e.g. 'C') - """ - if prediction in options[:len(choices)]: - return options.index(prediction) - else: - return random.choice(range(len(choices))) - - -if __name__ == "__main__": - args = get_args() - - base_dir = args.base_dir - split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] - problems = json.load(open(os.path.join(base_dir, "problems.json"))) - our_predictions = [json.loads(line) for line in open(args.our_result)] - our_predictions = {pred['question_id']: pred for pred in our_predictions} - split_problems = {idx: problems[idx] for idx in split_indices} - - gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] - - results = defaultdict(lambda: 0) - - for prob_id, prob in split_problems.items(): - if prob_id not in our_predictions: - continue - if prob_id not in gpt4_predictions: - continue - our_pred = our_predictions[prob_id]['text'] - gpt4_pred = gpt4_predictions[prob_id] - - pattern = re.compile(r'The answer is ([A-Z]).') - our_res = pattern.findall(our_pred) - if len(our_res) == 1: - our_answer = our_res[0] # 'A', 'B', ... - else: - our_answer = "FAILED" - gpt4_res = pattern.findall(gpt4_pred) - if len(gpt4_res) == 1: - gpt4_answer = gpt4_res[0] # 'A', 'B', ... 
- else: - gpt4_answer = "FAILED" - - our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) - gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) - - if gpt4_answer == 'FAILED': - results['gpt4_failed'] += 1 - # continue - gpt4_pred_idx = our_pred_idx - # if our_pred_idx != prob['answer']: - # print(our_predictions[prob_id]['prompt']) - # print('-----------------') - # print(f'LECTURE: {prob["lecture"]}') - # print(f'SOLUTION: {prob["solution"]}') - # print('=====================') - else: - # continue - pass - # gpt4_pred_idx = our_pred_idx - - if gpt4_pred_idx == prob['answer']: - results['correct'] += 1 - else: - results['incorrect'] += 1 - - - if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: - results['correct_upperbound'] += 1 - - correct = results['correct'] - total = results['correct'] + results['incorrect'] - print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%') - print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') - print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') - diff --git a/LLAVA_Biovil/llava/eval/eval_science_qa_gpt4_requery.py b/LLAVA_Biovil/llava/eval/eval_science_qa_gpt4_requery.py deleted file mode 100644 index 698546e995d365d1ccc2c25a87e6c5cd681e6eb6..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/eval/eval_science_qa_gpt4_requery.py +++ /dev/null @@ -1,149 +0,0 @@ -import argparse -import json -import os -import re -import random -from collections import defaultdict - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--base-dir', type=str) - parser.add_argument('--gpt4-result', type=str) - parser.add_argument('--requery-result', type=str) - parser.add_argument('--our-result', type=str) - parser.add_argument('--output-result', type=str) - 
parser.add_argument('--split', type=str, default='test') - parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) - return parser.parse_args() - - -def convert_caps(results): - fakecaps = [] - for result in results: - image_id = result['question_id'] - caption = result['text'] - fakecaps.append({"image_id": int(image_id), "caption": caption}) - return fakecaps - - -def get_pred_idx(prediction, choices, options): - """ - Get the index (e.g. 2) from the prediction (e.g. 'C') - """ - if prediction in options[:len(choices)]: - return options.index(prediction) - else: - return random.choice(range(len(choices))) - - -if __name__ == "__main__": - args = get_args() - - base_dir = args.base_dir - split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] - problems = json.load(open(os.path.join(base_dir, "problems.json"))) - our_predictions = [json.loads(line) for line in open(args.our_result)] - our_predictions = {pred['question_id']: pred for pred in our_predictions} - split_problems = {idx: problems[idx] for idx in split_indices} - - requery_predictions = [json.loads(line) for line in open(args.requery_result)] - requery_predictions = {pred['question_id']: pred for pred in requery_predictions} - - gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] - - results = defaultdict(lambda: 0) - - sqa_results = {} - sqa_results['acc'] = None - sqa_results['correct'] = None - sqa_results['count'] = None - sqa_results['results'] = {} - sqa_results['outputs'] = {} - - for prob_id, prob in split_problems.items(): - if prob_id not in our_predictions: - assert False - if prob_id not in gpt4_predictions: - assert False - our_pred = our_predictions[prob_id]['text'] - gpt4_pred = gpt4_predictions[prob_id] - if prob_id not in requery_predictions: - results['missing_requery'] += 1 - requery_pred = "MISSING" - else: - requery_pred = requery_predictions[prob_id]['text'] - - pattern = re.compile(r'The answer is ([A-Z]).') - 
our_res = pattern.findall(our_pred) - if len(our_res) == 1: - our_answer = our_res[0] # 'A', 'B', ... - else: - our_answer = "FAILED" - - requery_res = pattern.findall(requery_pred) - if len(requery_res) == 1: - requery_answer = requery_res[0] # 'A', 'B', ... - else: - requery_answer = "FAILED" - - gpt4_res = pattern.findall(gpt4_pred) - if len(gpt4_res) == 1: - gpt4_answer = gpt4_res[0] # 'A', 'B', ... - else: - gpt4_answer = "FAILED" - - our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) - gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) - requery_pred_idx = get_pred_idx(requery_answer, prob['choices'], args.options) - - results['total'] += 1 - - if gpt4_answer == 'FAILED': - results['gpt4_failed'] += 1 - if gpt4_pred_idx == prob['answer']: - results['gpt4_correct'] += 1 - if our_pred_idx == prob['answer']: - results['gpt4_ourvisual_correct'] += 1 - elif gpt4_pred_idx == prob['answer']: - results['gpt4_correct'] += 1 - results['gpt4_ourvisual_correct'] += 1 - - if our_pred_idx == prob['answer']: - results['our_correct'] += 1 - - if requery_answer == 'FAILED': - sqa_results['results'][prob_id] = our_pred_idx - if our_pred_idx == prob['answer']: - results['requery_correct'] += 1 - else: - sqa_results['results'][prob_id] = requery_pred_idx - if requery_pred_idx == prob['answer']: - results['requery_correct'] += 1 - else: - print(f""" -Question ({args.options[prob['answer']]}): {our_predictions[prob_id]['prompt']} -Our ({our_answer}): {our_pred} -GPT-4 ({gpt4_answer}): {gpt4_pred} -Requery ({requery_answer}): {requery_pred} -print("=====================================") -""") - - if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: - results['correct_upperbound'] += 1 - - total = results['total'] - print(f'Total: {total}, Our-Correct: {results["our_correct"]}, Accuracy: {results["our_correct"] / total * 100:.2f}%') - print(f'Total: {total}, GPT-4-Correct: {results["gpt4_correct"]}, Accuracy: 
{results["gpt4_correct"] / total * 100:.2f}%') - print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') - print(f'Total: {total}, GPT-4-OursVisual-Correct: {results["gpt4_ourvisual_correct"]}, Accuracy: {results["gpt4_ourvisual_correct"] / total * 100:.2f}%') - print(f'Total: {total}, Requery-Correct: {results["requery_correct"]}, Accuracy: {results["requery_correct"] / total * 100:.2f}%') - print(f'Total: {total}, Correct upper: {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') - - sqa_results['acc'] = results["requery_correct"] / total * 100 - sqa_results['correct'] = results["requery_correct"] - sqa_results['count'] = total - - with open(args.output_result, 'w') as f: - json.dump(sqa_results, f, indent=2) - diff --git a/LLAVA_Biovil/llava/eval/eval_textvqa.py b/LLAVA_Biovil/llava/eval/eval_textvqa.py deleted file mode 100644 index 838b09e81df1ec9843c84908d39e14e418ba9d47..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/eval/eval_textvqa.py +++ /dev/null @@ -1,65 +0,0 @@ -import os -import argparse -import json -import re - -from LLAV.llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--annotation-file', type=str) - parser.add_argument('--result-file', type=str) - parser.add_argument('--result-dir', type=str) - return parser.parse_args() - - -def prompt_processor(prompt): - if prompt.startswith('OCR tokens: '): - pattern = r"Question: (.*?) 
Short answer:" - match = re.search(pattern, prompt, re.DOTALL) - question = match.group(1) - elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: - if prompt.startswith('Reference OCR token:'): - question = prompt.split('\n')[1] - else: - question = prompt.split('\n')[0] - elif len(prompt.split('\n')) == 2: - question = prompt.split('\n')[0] - else: - assert False - - return question.lower() - - -def eval_single(annotation_file, result_file): - experiment_name = os.path.splitext(os.path.basename(result_file))[0] - print(experiment_name) - annotations = json.load(open(annotation_file))['data'] - annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} - results = [json.loads(line) for line in open(result_file)] - - pred_list = [] - for result in results: - annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] - pred_list.append({ - "pred_answer": result['text'], - "gt_answers": annotation['answers'], - }) - - evaluator = TextVQAAccuracyEvaluator() - print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. 
* evaluator.eval_pred_list(pred_list))) - - -if __name__ == "__main__": - args = get_args() - - if args.result_file is not None: - eval_single(args.annotation_file, args.result_file) - - if args.result_dir is not None: - for result_file in sorted(os.listdir(args.result_dir)): - if not result_file.endswith('.jsonl'): - print(f'Skipping {result_file}') - continue - eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) diff --git a/LLAVA_Biovil/llava/eval/generate_webpage_data_from_table.py b/LLAVA_Biovil/llava/eval/generate_webpage_data_from_table.py deleted file mode 100644 index 92602258ccd953a1d7137056aaf15c8de8166e21..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/eval/generate_webpage_data_from_table.py +++ /dev/null @@ -1,111 +0,0 @@ -"""Generate json file for webpage.""" -import json -import os -import re - -# models = ['llama', 'alpaca', 'gpt35', 'bard'] -models = ['vicuna'] - - -def read_jsonl(path: str, key: str=None): - data = [] - with open(os.path.expanduser(path)) as f: - for line in f: - if not line: - continue - data.append(json.loads(line)) - if key is not None: - data.sort(key=lambda x: x[key]) - data = {item[key]: item for item in data} - return data - - -def trim_hanging_lines(s: str, n: int) -> str: - s = s.strip() - for _ in range(n): - s = s.split('\n', 1)[1].strip() - return s - - -if __name__ == '__main__': - questions = read_jsonl('table/question.jsonl', key='question_id') - - # alpaca_answers = read_jsonl('table/answer/answer_alpaca-13b.jsonl', key='question_id') - # bard_answers = read_jsonl('table/answer/answer_bard.jsonl', key='question_id') - # gpt35_answers = read_jsonl('table/answer/answer_gpt35.jsonl', key='question_id') - # llama_answers = read_jsonl('table/answer/answer_llama-13b.jsonl', key='question_id') - vicuna_answers = read_jsonl('table/answer/answer_vicuna-13b.jsonl', key='question_id') - ours_answers = read_jsonl('table/results/llama-13b-hf-alpaca.jsonl', key='question_id') - - 
review_vicuna = read_jsonl('table/review/review_vicuna-13b_llama-13b-hf-alpaca.jsonl', key='question_id') - # review_alpaca = read_jsonl('table/review/review_alpaca-13b_vicuna-13b.jsonl', key='question_id') - # review_bard = read_jsonl('table/review/review_bard_vicuna-13b.jsonl', key='question_id') - # review_gpt35 = read_jsonl('table/review/review_gpt35_vicuna-13b.jsonl', key='question_id') - # review_llama = read_jsonl('table/review/review_llama-13b_vicuna-13b.jsonl', key='question_id') - - records = [] - for qid in questions.keys(): - r = { - 'id': qid, - 'category': questions[qid]['category'], - 'question': questions[qid]['text'], - 'answers': { - # 'alpaca': alpaca_answers[qid]['text'], - # 'llama': llama_answers[qid]['text'], - # 'bard': bard_answers[qid]['text'], - # 'gpt35': gpt35_answers[qid]['text'], - 'vicuna': vicuna_answers[qid]['text'], - 'ours': ours_answers[qid]['text'], - }, - 'evaluations': { - # 'alpaca': review_alpaca[qid]['text'], - # 'llama': review_llama[qid]['text'], - # 'bard': review_bard[qid]['text'], - 'vicuna': review_vicuna[qid]['content'], - # 'gpt35': review_gpt35[qid]['text'], - }, - 'scores': { - 'vicuna': review_vicuna[qid]['tuple'], - # 'alpaca': review_alpaca[qid]['score'], - # 'llama': review_llama[qid]['score'], - # 'bard': review_bard[qid]['score'], - # 'gpt35': review_gpt35[qid]['score'], - }, - } - - # cleanup data - cleaned_evals = {} - for k, v in r['evaluations'].items(): - v = v.strip() - lines = v.split('\n') - # trim the first line if it's a pair of numbers - if re.match(r'\d+[, ]+\d+', lines[0]): - lines = lines[1:] - v = '\n'.join(lines) - cleaned_evals[k] = v.replace('Assistant 1', "**Assistant 1**").replace('Assistant 2', '**Assistant 2**') - - r['evaluations'] = cleaned_evals - records.append(r) - - # Reorder the records, this is optional - for r in records: - if r['id'] <= 20: - r['id'] += 60 - else: - r['id'] -= 20 - for r in records: - if r['id'] <= 50: - r['id'] += 10 - elif 50 < r['id'] <= 60: - r['id'] -= 
class EvalAIAnswerProcessor:
    """
    Processes an answer similar to Eval AI
        copied from
        https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897

    Calling an instance on a raw answer string lower-cases it, strips
    punctuation, maps number words to digits, drops articles and restores
    apostrophes in known contractions.
    """

    # Apostrophe-less spellings mapped to their contracted form. The table is
    # kept exactly as in the upstream source, including its odd entries.
    CONTRACTIONS = {
        "aint": "ain't",
        "arent": "aren't",
        "cant": "can't",
        "couldve": "could've",
        "couldnt": "couldn't",
        "couldn'tve": "couldn't've",
        "couldnt've": "couldn't've",
        "didnt": "didn't",
        "doesnt": "doesn't",
        "dont": "don't",
        "hadnt": "hadn't",
        "hadnt've": "hadn't've",
        "hadn'tve": "hadn't've",
        "hasnt": "hasn't",
        "havent": "haven't",
        "hed": "he'd",
        "hed've": "he'd've",
        "he'dve": "he'd've",
        "hes": "he's",
        "howd": "how'd",
        "howll": "how'll",
        "hows": "how's",
        "Id've": "I'd've",
        "I'dve": "I'd've",
        "Im": "I'm",
        "Ive": "I've",
        "isnt": "isn't",
        "itd": "it'd",
        "itd've": "it'd've",
        "it'dve": "it'd've",
        "itll": "it'll",
        "let's": "let's",
        "maam": "ma'am",
        "mightnt": "mightn't",
        "mightnt've": "mightn't've",
        "mightn'tve": "mightn't've",
        "mightve": "might've",
        "mustnt": "mustn't",
        "mustve": "must've",
        "neednt": "needn't",
        "notve": "not've",
        "oclock": "o'clock",
        "oughtnt": "oughtn't",
        "ow's'at": "'ow's'at",
        "'ows'at": "'ow's'at",
        "'ow'sat": "'ow's'at",
        "shant": "shan't",
        "shed've": "she'd've",
        "she'dve": "she'd've",
        "she's": "she's",
        "shouldve": "should've",
        "shouldnt": "shouldn't",
        "shouldnt've": "shouldn't've",
        "shouldn'tve": "shouldn't've",
        "somebody'd": "somebodyd",
        "somebodyd've": "somebody'd've",
        "somebody'dve": "somebody'd've",
        "somebodyll": "somebody'll",
        "somebodys": "somebody's",
        "someoned": "someone'd",
        "someoned've": "someone'd've",
        "someone'dve": "someone'd've",
        "someonell": "someone'll",
        "someones": "someone's",
        "somethingd": "something'd",
        "somethingd've": "something'd've",
        "something'dve": "something'd've",
        "somethingll": "something'll",
        "thats": "that's",
        "thered": "there'd",
        "thered've": "there'd've",
        "there'dve": "there'd've",
        "therere": "there're",
        "theres": "there's",
        "theyd": "they'd",
        "theyd've": "they'd've",
        "they'dve": "they'd've",
        "theyll": "they'll",
        "theyre": "they're",
        "theyve": "they've",
        "twas": "'twas",
        "wasnt": "wasn't",
        "wed've": "we'd've",
        "we'dve": "we'd've",
        "weve": "we've",
        "werent": "weren't",
        "whatll": "what'll",
        "whatre": "what're",
        "whats": "what's",
        "whatve": "what've",
        "whens": "when's",
        "whered": "where'd",
        "wheres": "where's",
        "whereve": "where've",
        "whod": "who'd",
        "whod've": "who'd've",
        "who'dve": "who'd've",
        "wholl": "who'll",
        "whos": "who's",
        "whove": "who've",
        "whyll": "why'll",
        "whyre": "why're",
        "whys": "why's",
        "wont": "won't",
        "wouldve": "would've",
        "wouldnt": "wouldn't",
        "wouldnt've": "wouldn't've",
        "wouldn'tve": "wouldn't've",
        "yall": "y'all",
        "yall'll": "y'all'll",
        "y'allll": "y'all'll",
        "yall'd've": "y'all'd've",
        "y'alld've": "y'all'd've",
        "y'all'dve": "y'all'd've",
        "youd": "you'd",
        "youd've": "you'd've",
        "you'dve": "you'd've",
        "youll": "you'll",
        "youre": "you're",
        "youve": "you've",
    }

    # Number words replaced by their digit string.
    NUMBER_MAP = {
        "none": "0",
        "zero": "0",
        "one": "1",
        "two": "2",
        "three": "3",
        "four": "4",
        "five": "5",
        "six": "6",
        "seven": "7",
        "eight": "8",
        "nine": "9",
        "ten": "10",
    }
    ARTICLES = ["a", "an", "the"]
    # NOTE(review): "(?!<=\d)" looks like a typo for the lookbehind "(?<!\d)".
    # As written it is a lookahead that never blocks a match at a ".", so the
    # pattern removes every period not followed by a digit. It is kept
    # unchanged so scores stay comparable with the upstream evaluator.
    PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)")
    # One or more commas sitting between two digits, e.g. "1,000".
    COMMA_STRIP = re.compile(r"(?<=\d)(\,)+(?=\d)")
    PUNCTUATIONS = [
        ";",
        r"/",
        "[",
        "]",
        '"',
        "{",
        "}",
        "(",
        ")",
        "=",
        "+",
        "\\",
        "_",
        "-",
        ">",
        "<",
        "@",
        "`",
        ",",
        "?",
        "!",
    ]

    def __init__(self, *args, **kwargs):
        # Accepts and ignores any arguments; the processor is stateless.
        pass

    def word_tokenize(self, word):
        """Lower-case ``word``, drop "," and "?", and split off a trailing "'s"."""
        word = word.lower()
        word = word.replace(",", "").replace("?", "").replace("'s", " 's")
        return word.strip()

    def process_punctuation(self, in_text):
        """Remove punctuation from ``in_text``.

        A punctuation mark is deleted outright when it touches a space in the
        input or when the input contains a comma between digits; otherwise it
        is replaced by a space. Periods not followed by a digit are then
        removed.
        """
        # The digit-comma test does not depend on the punctuation mark, so it
        # is evaluated once instead of once per mark.
        has_digit_comma = self.COMMA_STRIP.search(in_text) is not None
        out_text = in_text
        for p in self.PUNCTUATIONS:
            if has_digit_comma or p + " " in in_text or " " + p in in_text:
                out_text = out_text.replace(p, "")
            else:
                out_text = out_text.replace(p, " ")
        # The third positional argument of Pattern.sub is ``count``; the
        # original passed re.UNICODE there, which capped removals at 32.
        return self.PERIOD_STRIP.sub("", out_text)

    def process_digit_article(self, in_text):
        """Map number words to digits, drop articles and fix contractions."""
        kept = []
        for word in in_text.lower().split():
            # ``get`` rather than ``setdefault``: a lookup must not insert
            # every unseen word into the shared class-level NUMBER_MAP.
            word = self.NUMBER_MAP.get(word, word)
            if word not in self.ARTICLES:
                kept.append(word)
        return " ".join(self.CONTRACTIONS.get(word, word) for word in kept)

    def __call__(self, item):
        """Return the normalised form of the answer string ``item``."""
        item = self.word_tokenize(item)
        item = item.replace("\n", " ").replace("\t", " ").strip()
        item = self.process_punctuation(item)
        item = self.process_digit_article(item)
        return item
class STVQAANLSEvaluator:
    """Score predictions with a thresholded normalised edit-distance similarity."""

    def __init__(self):
        import editdistance  # install with `pip install editdistance`

        self.get_edit_distance = editdistance.eval

    def get_anls(self, s1, s2):
        """Return the similarity of two strings after lower-casing and stripping.

        The similarity is ``1 - edit_distance / max(len(s1), len(s2))`` and is
        reported as 0.0 when it falls below 0.5.
        """
        s1 = s1.lower().strip()
        s2 = s2.lower().strip()
        longest = max(len(s1), len(s2))
        if longest == 0:
            # Both strings are empty, hence identical; the original divided
            # by zero here.
            return 1.0
        similarity = 1 - self.get_edit_distance(s1, s2) / longest
        return similarity if similarity >= 0.5 else 0.0

    def eval_pred_list(self, pred_list):
        """Return the mean, over entries, of the best score against any ground truth.

        Each entry is a mapping with a ``"pred_answer"`` string and a
        ``"gt_answers"`` iterable of strings. An entry with no ground-truth
        answers scores 0.0 instead of making ``max()`` raise.
        """
        pred_scores = [
            max(
                (self.get_anls(entry["pred_answer"], gt) for gt in entry["gt_answers"]),
                default=0.0,
            )
            for entry in pred_list
        ]
        accuracy = sum(pred_scores) / len(pred_scores)
        return accuracy
- # The pycocoevalcap can be installed with pip as - # pip install git+https://github.com/ronghanghu/coco-caption.git@python23 - # Original pycocoevalcap code is at https://github.com/tylin/coco-caption - # but has no python3 support yet. - try: - from pycocoevalcap.bleu.bleu import Bleu - from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer - except ModuleNotFoundError: - print( - "Please install pycocoevalcap module using " - "pip install git+https://github.com/ronghanghu/coco-caption.git@python23" # noqa - ) - raise - - self.tokenizer = PTBTokenizer() - self.scorer = Bleu(4) - - def eval_pred_list(self, pred_list): - # Create reference and hypotheses captions. - gts = {} - res = {} - for idx, entry in enumerate(pred_list): - gts[idx] = [{"caption": a} for a in entry["gt_answers"]] - res[idx] = [{"caption": entry["pred_answer"]}] - - gts = self.tokenizer.tokenize(gts) - res = self.tokenizer.tokenize(res) - score, _ = self.scorer.compute_score(gts, res) - - bleu4 = score[3] # score is (Bleu-1, Bleu-2, Bleu-3, Bleu-4) - return bleu4 diff --git a/LLAVA_Biovil/llava/eval/model_qa.py b/LLAVA_Biovil/llava/eval/model_qa.py deleted file mode 100644 index 6fa3ef1263aadf63ef75187bbffbee8b23908ad9..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/eval/model_qa.py +++ /dev/null @@ -1,85 +0,0 @@ -import argparse -from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria -import torch -import os -import json -from tqdm import tqdm -import shortuuid - -from LLAV.llava.conversation import default_conversation -from LLAV.llava.utils import disable_torch_init - - -# new stopping implementation -class KeywordsStoppingCriteria(StoppingCriteria): - def __init__(self, keywords, tokenizer, input_ids): - self.keywords = keywords - self.tokenizer = tokenizer - self.start_len = None - self.input_ids = input_ids - - def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: - if self.start_len is None: - 
@torch.inference_mode()
def eval_model(model_name, questions_file, answers_file):
    """Generate an answer for every question in a JSONL file.

    Args:
        model_name: Model identifier or path handed to ``from_pretrained``;
            ``~`` is expanded.
        questions_file: JSONL file whose records provide ``question_id`` and
            ``text``.
        answers_file: Output JSONL file; one record is written and flushed
            per question.
    """
    # Model
    disable_torch_init()
    model_name = os.path.expanduser(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(model_name,
        torch_dtype=torch.float16).cuda()

    # Context managers close both files even when generation raises; the
    # original never closed the question file at all.
    with open(os.path.expanduser(questions_file), "r") as ques_file, \
            open(os.path.expanduser(answers_file), "w") as ans_file:
        for line in tqdm(ques_file):
            record = json.loads(line)  # decode once instead of once per field
            idx = record["question_id"]
            qs = record["text"]

            conv = default_conversation.copy()
            conv.append_message(conv.roles[0], qs)
            prompt = conv.get_prompt()
            inputs = tokenizer([prompt])
            input_ids = torch.as_tensor(inputs.input_ids).cuda()
            stopping_criteria = KeywordsStoppingCriteria([conv.sep], tokenizer, input_ids)
            output_ids = model.generate(
                input_ids,
                do_sample=True,
                use_cache=True,
                temperature=0.7,
                max_new_tokens=1024,
                stopping_criteria=[stopping_criteria])
            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

            # Cut the reply at the first separator after the prompt; when the
            # model produced none, append one so the slice below ends at the
            # end of the text.
            try:
                index = outputs.index(conv.sep, len(prompt))
            except ValueError:
                outputs += conv.sep
                index = outputs.index(conv.sep, len(prompt))

            # Skip the echoed prompt, the assistant role name and two more
            # characters. NOTE(review): presumably the ": " after the role.
            outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip()
            ans_id = shortuuid.uuid()
            ans_file.write(json.dumps({"question_id": idx,
                                       "text": outputs,
                                       "answer_id": ans_id,
                                       "model_id": model_name,
                                       "metadata": {}}) + "\n")
            ans_file.flush()
def split_list(lst, n):
    """Split a list into at most n (roughly) equal-sized chunks.

    The chunk size is ``ceil(len(lst) / n)``, so fewer than ``n`` chunks are
    returned when the length does not divide evenly (5 items with ``n=4``
    give 3 chunks), and an empty input gives no chunks.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    if len(lst) == 0:
        # A chunk size of 0 would be an invalid step for range() below.
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
= line["text"] - cur_prompt = qs - if model.config.mm_use_im_start_end: - qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs - else: - qs = DEFAULT_IMAGE_TOKEN + '\n' + qs - - conv = conv_templates[args.conv_mode].copy() - conv.append_message(conv.roles[0], qs) - conv.append_message(conv.roles[1], None) - prompt = conv.get_prompt() - - input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() - - image = Image.open(os.path.join(args.image_folder, image_file)) - image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] - - stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 - keywords = [stop_str] - stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) - - with torch.inference_mode(): - output_ids = model.generate( - input_ids, - images=image_tensor.unsqueeze(0).half().cuda(), - do_sample=True if args.temperature > 0 else False, - temperature=args.temperature, - top_p=args.top_p, - num_beams=args.num_beams, - # no_repeat_ngram_size=3, - max_new_tokens=1024, - use_cache=True) - - input_token_len = input_ids.shape[1] - n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() - if n_diff_input_output > 0: - print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') - outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] - outputs = outputs.strip() - if outputs.endswith(stop_str): - outputs = outputs[:-len(stop_str)] - outputs = outputs.strip() - - ans_id = shortuuid.uuid() - ans_file.write(json.dumps({"question_id": idx, - "prompt": cur_prompt, - "text": outputs, - "answer_id": ans_id, - "model_id": model_name, - "metadata": {}}) + "\n") - ans_file.flush() - ans_file.close() - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--model-path", type=str, 
# Custom dataset class
class CustomDataset(Dataset):
    """Dataset yielding ``(input_ids, image_tensor)`` for each question.

    Args:
        questions: Indexable collection of records with ``"image"`` and
            ``"text"`` entries.
        image_folder: Directory that the ``"image"`` paths are relative to.
        tokenizer: Tokenizer passed to ``tokenizer_image_token``.
        image_processor: Processor passed to ``process_images``.
        model_config: Model config; ``mm_use_im_start_end`` is read from it.
        conv_mode: Key into ``conv_templates``. When None, the module-level
            ``args.conv_mode`` is used, as in the original implementation.
    """

    def __init__(self, questions, image_folder, tokenizer, image_processor, model_config, conv_mode=None):
        self.questions = questions
        self.image_folder = image_folder
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.model_config = model_config
        self.conv_mode = conv_mode

    def __getitem__(self, index):
        line = self.questions[index]
        image_file = line["image"]
        qs = line["text"]
        if self.model_config.mm_use_im_start_end:
            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

        # Prefer the explicitly supplied mode. The fallback keeps existing
        # callers working: they rely on the script-level ``args`` global,
        # read at access time so later changes to it are still honoured.
        conv_mode = self.conv_mode if self.conv_mode is not None else args.conv_mode
        conv = conv_templates[conv_mode].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB')
        image_tensor = process_images([image], self.image_processor, self.model_config)[0]

        input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')

        return input_ids, image_tensor

    def __len__(self):
        return len(self.questions)
"w") - - if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: - args.conv_mode = args.conv_mode + '_mmtag' - print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') - - data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config) - - for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): - idx = line["question_id"] - cur_prompt = line["text"] - - input_ids = input_ids.to(device='cuda', non_blocking=True) - - with torch.inference_mode(): - output_ids = model.generate( - input_ids, - images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), - do_sample=True if args.temperature > 0 else False, - temperature=args.temperature, - top_p=args.top_p, - num_beams=args.num_beams, - max_new_tokens=args.max_new_tokens, - use_cache=True) - - input_token_len = input_ids.shape[1] - n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() - if n_diff_input_output > 0: - print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') - outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] - outputs = outputs.strip() - - ans_id = shortuuid.uuid() - ans_file.write(json.dumps({"question_id": idx, - "prompt": cur_prompt, - "text": outputs, - "answer_id": ans_id, - "model_id": model_name, - "metadata": {}}) + "\n") - # ans_file.flush() - ans_file.close() - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--model-path", type=str, default="facebook/opt-350m") - parser.add_argument("--model-base", type=str, default=None) - parser.add_argument("--image-folder", type=str, default="") - parser.add_argument("--question-file", type=str, default="tables/question.jsonl") - parser.add_argument("--answers-file", type=str, default="answer.jsonl") - 
def is_none(value):
    """Return True when ``value`` stands for a missing entry.

    Missing means ``None``, a NaN float (including float subclasses), or the
    string "nan" / "none" in any letter case.
    """
    if value is None:
        return True
    # isinstance, not ``type(value) is float``, so float subclasses holding
    # NaN are recognised as well.
    if isinstance(value, float):
        return math.isnan(value)
    if isinstance(value, str):
        return value.lower() in ("nan", "none")
    return False
def eval_model(args):
    """Answer multiple-choice questions from a table and write JSONL results.

    ``args`` is the parsed command line. For every row of the question table
    the model is asked once, or once per option rotation when
    ``args.all_rounds`` is set, and each answer is written and flushed as one
    JSON record.
    """
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    questions = pd.read_table(os.path.expanduser(args.question_file))
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    answers_file = os.path.expanduser(args.answers_file)
    answers_dir = os.path.dirname(answers_file)
    if answers_dir:
        # os.makedirs("") raises, which made a bare file name such as the
        # default "answer.jsonl" unusable.
        os.makedirs(answers_dir, exist_ok=True)

    if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
        args.conv_mode = args.conv_mode + '_mmtag'
        print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')

    # The context manager closes the output file even if generation fails.
    with open(answers_file, "w") as ans_file:
        for _, row in tqdm(questions.iterrows(), total=len(questions)):
            options = get_options(row, all_options)
            cur_option_char = all_options[:len(options)]
            num_rounds = len(options) if args.all_rounds else 1

            # Values that are the same in every round of this row.
            idx = row['index']
            hint = row['hint']
            base_question = row['question']
            if not is_none(hint):
                base_question = hint + '\n' + base_question
            image = load_image_from_base64(row['image'])

            for round_idx in range(num_rounds):
                # Letters always run A, B, C, ... while ``options`` rotates
                # between rounds; ``cur_option_char`` records which original
                # letter each displayed option came from.
                question = base_question
                for option_char, option in zip(all_options[:len(options)], options):
                    question = question + '\n' + option_char + '. ' + option
                qs = cur_prompt = question
                if model.config.mm_use_im_start_end:
                    qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
                else:
                    qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

                if args.single_pred_prompt:
                    if args.lang == 'cn':
                        qs = qs + '\n' + "请直接回答选项字母。"
                    else:
                        qs = qs + '\n' + "Answer with the option's letter from the given choices directly."

                conv = conv_templates[args.conv_mode].copy()
                conv.append_message(conv.roles[0], qs)
                conv.append_message(conv.roles[1], None)
                prompt = conv.get_prompt()

                input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

                image_tensor = process_images([image], image_processor, model.config)[0]

                stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2

                with torch.inference_mode():
                    output_ids = model.generate(
                        input_ids,
                        images=image_tensor.unsqueeze(0).half().cuda(),
                        do_sample=True if args.temperature > 0 else False,
                        temperature=args.temperature,
                        top_p=args.top_p,
                        num_beams=args.num_beams,
                        # no_repeat_ngram_size=3,
                        max_new_tokens=1024,
                        use_cache=True)

                input_token_len = input_ids.shape[1]
                n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
                if n_diff_input_output > 0:
                    print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
                outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
                outputs = outputs.strip()
                if outputs.endswith(stop_str):
                    outputs = outputs[:-len(stop_str)]
                outputs = outputs.strip()

                ans_id = shortuuid.uuid()
                ans_file.write(json.dumps({"question_id": idx,
                                           "round_id": round_idx,
                                           "prompt": cur_prompt,
                                           "text": outputs,
                                           "options": options,
                                           "option_char": cur_option_char,
                                           "answer_id": ans_id,
                                           "model_id": model_name,
                                           "metadata": {}}) + "\n")
                ans_file.flush()

                # rotate options
                options = options[1:] + options[:1]
                cur_option_char = cur_option_char[1:] + cur_option_char[:1]
def eval_model(args):
    """Run the Q-Bench multiple-choice evaluation.

    Reads the question list from ``args.questions_file`` (a JSON array),
    asks the model about each image and appends every question record,
    extended with a ``"response"`` field, to ``args.answers_file`` as one
    JSON object per line.

    Raises:
        NotImplementedError: if ``args.lang`` is neither "en" nor "zh".
    """
    # Validate the language before the model is loaded so a bad flag fails fast.
    if args.lang == "en":
        options_header = "\nChoose between one of the options as follows:\n"
    elif args.lang == "zh":
        # The original literal began with the invalid escape "\在"; a newline
        # was intended, mirroring the English prompt.
        options_header = "\n在下列选项中选择一个:\n"
    else:
        raise NotImplementedError("Q-Bench does not support languages other than English (en) and Chinese (zh) yet. Contact us (https://github.com/VQAssessment/Q-Bench/) to convert Q-Bench into more languages.")

    # Model
    disable_torch_init()

    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, True)

    # The conversation template depends only on the model name and the
    # command line, so it is resolved (and any warning printed) once.
    lowered_name = model_name.lower()
    if 'llama-2' in lowered_name:
        conv_mode = "llava_llama_2"
    elif "v1" in lowered_name:
        conv_mode = "llava_v1"
    elif "mpt" in lowered_name:
        conv_mode = "mpt"
    else:
        conv_mode = "llava_v0"

    if args.conv_mode is not None and conv_mode != args.conv_mode:
        print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode))
    else:
        args.conv_mode = conv_mode

    # The original concatenated folder and file name directly, which breaks
    # for a folder given without a trailing separator (as in the default).
    image_folder = args.image_folder
    if image_folder and not image_folder.endswith(("/", "\\")):
        image_folder += "/"

    with open(args.questions_file) as f:
        llvqa_data = json.load(f)

    for llddata in tqdm(llvqa_data):
        filename = llddata["img_path"]
        message = llddata["question"] + options_header
        for choice, ans in zip(["A.", "B.", "C.", "D."], llddata["candidates"]):
            message += f"{choice} {ans}\n"
        qs = message

        if model.config.mm_use_im_start_end:
            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

        conv = conv_templates[args.conv_mode].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        image = load_image(image_folder + filename)
        image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].half().cuda()

        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=image_tensor,
                num_beams=1,
                do_sample=False,
                temperature=0,
                max_new_tokens=1024,
                use_cache=True,
                stopping_criteria=[stopping_criteria])

        input_token_len = input_ids.shape[1]
        n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
        if n_diff_input_output > 0:
            print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
        outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
        outputs = outputs.strip()
        if outputs.endswith(stop_str):
            outputs = outputs[:-len(stop_str)]
        outputs = outputs.strip()
        llddata["response"] = outputs
        # Append mode is kept so partial results survive a crash. The newline
        # separates records; without it the objects ran together and the file
        # could not be parsed line by line.
        with open(args.answers_file, "a") as wf:
            wf.write(json.dumps(llddata) + "\n")
def split_list(lst, n):
    """Split ``lst`` into at most ``n`` contiguous, roughly equal-sized chunks.

    The chunk size is ``ceil(len(lst) / n)``, so the last chunk may be shorter
    and fewer than ``n`` chunks are produced when ``len(lst) < n``.
    An empty ``lst`` yields an empty list of chunks.
    """
    if not lst:
        # A zero chunk size would make range() raise on a zero step.
        return []
    chunk_size = math.ceil(len(lst) / n)  # round up so we never exceed n chunks
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    """Return the ``k``-th of the ``n`` chunks of ``lst``.

    When rounding produced fewer than ``n`` chunks and ``k`` addresses one of
    the missing trailing chunks, an empty list is returned so that the
    corresponding worker simply has nothing to do.
    """
    chunks = split_list(lst, n)
    if len(chunks) <= k < n:
        return []
    return chunks[k]


def _build_stopping_criteria(conv, stop_str, tokenizer, input_ids):
    """Return the stopping-criteria list for ``input_ids`` (only for "v0" conversations)."""
    if conv.version != "v0":
        return None
    return [KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)]


def _decode_new_tokens(tokenizer, input_ids, output_ids, stop_str):
    """Decode the tokens generated after the prompt and strip a trailing ``stop_str``."""
    input_token_len = input_ids.shape[1]
    n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
    if n_diff_input_output > 0:
        print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
    outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
    outputs = outputs.strip()
    if outputs.endswith(stop_str):
        outputs = outputs[:-len(stop_str)]
    return outputs.strip()


def eval_model(args):
    """Run the model over one chunk of the question file and write JSONL answers.

    Reads ``args.question_file`` (a JSON list), keeps chunk ``args.chunk_idx``
    of ``args.num_chunks``, generates an answer per question and writes one
    JSON object per line to ``args.answers_file``. When
    ``args.answer_prompter`` is set, a second short generation is run on
    ``prompt + reasoning + ' ###\\nANSWER:'`` and appended to the reasoning.
    """
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    with open(os.path.expanduser(args.question_file), "r") as f:
        questions = json.load(f)
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)

    answers_file = os.path.expanduser(args.answers_file)
    answers_dir = os.path.dirname(answers_file)
    if answers_dir:
        # dirname is '' for a bare file name (the default), and makedirs('') fails.
        os.makedirs(answers_dir, exist_ok=True)

    with open(answers_file, "w") as ans_file:
        for line in tqdm(questions):
            idx = line["id"]
            question = line['conversations'][0]
            # NOTE(review): the replaced literal is empty in this copy of the file,
            # which makes the call a no-op; it may have been an image tag that was
            # stripped from the text -- confirm against the upstream script.
            qs = question['value'].replace('', '').strip()
            cur_prompt = qs

            if 'image' in line:
                image_file = line["image"]
                image = Image.open(os.path.join(args.image_folder, image_file))
                image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
                images = image_tensor.unsqueeze(0).half().cuda()
                if getattr(model.config, 'mm_use_im_start_end', False):
                    qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
                else:
                    qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
                cur_prompt = '' + '\n' + cur_prompt
            else:
                images = None

            if args.single_pred_prompt:
                qs = qs + '\n' + "Answer with the option's letter from the given choices directly."
                cur_prompt = cur_prompt + '\n' + "Answer with the option's letter from the given choices directly."

            conv = conv_templates[args.conv_mode].copy()
            conv.append_message(conv.roles[0], qs)
            conv.append_message(conv.roles[1], None)
            prompt = conv.get_prompt()

            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

            stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
            stopping_criteria = _build_stopping_criteria(conv, stop_str, tokenizer, input_ids)

            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=images,
                    do_sample=args.temperature > 0,
                    temperature=args.temperature,
                    max_new_tokens=1024,
                    use_cache=True,
                    stopping_criteria=stopping_criteria,
                )
            outputs = _decode_new_tokens(tokenizer, input_ids, output_ids, stop_str)

            # prompt for answer
            if args.answer_prompter:
                outputs_reasoning = outputs
                input_ids = tokenizer_image_token(prompt + outputs_reasoning + ' ###\nANSWER:', tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
                # Rebuild the criteria for the new prompt instead of wrapping the
                # previous list (or None) in another list.
                stopping_criteria = _build_stopping_criteria(conv, stop_str, tokenizer, input_ids)

                with torch.inference_mode():
                    output_ids = model.generate(
                        input_ids,
                        images=images,
                        do_sample=args.temperature > 0,
                        temperature=args.temperature,
                        max_new_tokens=64,
                        use_cache=True,
                        stopping_criteria=stopping_criteria,
                    )
                outputs = _decode_new_tokens(tokenizer, input_ids, output_ids, stop_str)
                outputs = outputs_reasoning + '\n The answer is ' + outputs

            ans_id = shortuuid.uuid()
            ans_file.write(json.dumps({"question_id": idx,
                                       "prompt": cur_prompt,
                                       "text": outputs,
                                       "answer_id": ans_id,
                                       "model_id": model_name,
                                       "metadata": {}}) + "\n")
            # Flush per answer so partial results survive an interrupted run.
            ans_file.flush()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--image-folder", type=str, default="")
    parser.add_argument("--question-file", type=str, default="tables/question.json")
    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
    parser.add_argument("--conv-mode", type=str, default="llava_v0")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--answer-prompter", action="store_true")
    parser.add_argument("--single-pred-prompt", action="store_true")
    args = parser.parse_args()

    eval_model(args)
MODEL = 'gpt-3.5-turbo'
MODEL_ID = 'gpt-3.5-turbo:20230327'
_MAX_ATTEMPTS = 3  # number of API calls tried per question before giving up


def get_answer(question_id: int, question: str, max_tokens: int):
    """Ask the chat model one question and return an answer record.

    Returns a dict with ``answer_id``, ``question_id``, ``model_id`` and
    ``text``. The request is attempted up to three times; if every attempt
    raises, ``text`` is the sentinel ``'#ERROR#'``.
    """
    ans = {
        'answer_id': shortuuid.uuid(),
        'question_id': question_id,
        'model_id': MODEL_ID,
    }
    for attempt in range(_MAX_ATTEMPTS):
        try:
            response = openai.ChatCompletion.create(
                model=MODEL,
                messages=[{
                    'role': 'system',
                    'content': 'You are a helpful assistant.'
                }, {
                    'role': 'user',
                    'content': question,
                }],
                max_tokens=max_tokens,
            )
            ans['text'] = response['choices'][0]['message']['content']
            return ans
        except Exception as e:
            # Best-effort: report, mark the record as failed, and retry.
            print('[ERROR]', e)
            ans['text'] = '#ERROR#'
            if attempt < _MAX_ATTEMPTS - 1:
                # No point in waiting after the final attempt.
                time.sleep(1)
    return ans


def load_questions(lines):
    """Build ``{question_id: text}`` from an iterable of JSONL lines.

    Blank or whitespace-only lines are skipped; lines read from a file keep
    their trailing newline, so a plain truthiness check would not skip them.
    """
    questions = {}
    for line in lines:
        if not line.strip():
            continue
        q = json.loads(line)
        questions[q['question_id']] = q['text']
    return questions


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ChatGPT answer generation.')
    parser.add_argument('-q', '--question')
    parser.add_argument('-o', '--output')
    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
    args = parser.parse_args()

    with open(os.path.expanduser(args.question)) as f:
        questions_dict = load_questions(f)

    answers = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
        futures = [
            executor.submit(get_answer, qid, question, args.max_tokens)
            for qid, question in questions_dict.items()
        ]

        for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            answers.append(future.result())

    # Futures complete in arbitrary order; restore question order for the output.
    answers.sort(key=lambda x: x['question_id'])

    with open(os.path.expanduser(args.output), 'w') as f:
        table = [json.dumps(ans) for ans in answers]
        f.write('\n'.join(table))
_REQUEST_TIMEOUT_S = 30  # upper bound for downloading a remote image


def image_parser(args):
    """Split ``args.image_file`` on ``args.sep`` into a list of image locations."""
    return args.image_file.split(args.sep)


def is_url(path):
    """Return True when ``path`` is an http(s) URL rather than a local file path."""
    return path.startswith(("http://", "https://"))


def load_image(image_file):
    """Load an image from a URL or a local path and convert it to RGB.

    Raises ``requests.HTTPError`` when a remote image responds with an error
    status, instead of handing the error page to the image decoder.
    """
    if is_url(image_file):
        response = requests.get(image_file, timeout=_REQUEST_TIMEOUT_S)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert("RGB")
    else:
        image = Image.open(image_file).convert("RGB")
    return image


def load_images(image_files):
    """Load every entry of ``image_files`` with :func:`load_image`, preserving order."""
    return [load_image(image_file) for image_file in image_files]


def eval_model(args):
    """Run a single query with one or more images through the model and print the reply.

    The conversation template is inferred from the model name unless
    ``args.conv_mode`` is given, in which case the explicit value wins (with a
    warning when the two disagree). ``args.conv_mode`` is updated in place
    when it was ``None``.
    """
    # Model
    disable_torch_init()

    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        args.model_path, args.model_base, model_name
    )

    qs = args.query
    image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
    image_token = image_token_se if model.config.mm_use_im_start_end else DEFAULT_IMAGE_TOKEN
    if IMAGE_PLACEHOLDER in qs:
        # Literal substitution: the membership test above is literal too, so a
        # regex substitution would only add escaping pitfalls.
        qs = qs.replace(IMAGE_PLACEHOLDER, image_token)
    else:
        qs = image_token + "\n" + qs

    if "llama-2" in model_name.lower():
        conv_mode = "llava_llama_2"
    elif "v1" in model_name.lower():
        conv_mode = "llava_v1"
    elif "mpt" in model_name.lower():
        conv_mode = "mpt"
    else:
        conv_mode = "llava_v0"

    if args.conv_mode is not None and conv_mode != args.conv_mode:
        print(
            "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format(
                conv_mode, args.conv_mode, args.conv_mode
            )
        )
    else:
        args.conv_mode = conv_mode

    conv = conv_templates[args.conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    image_files = image_parser(args)
    images = load_images(image_files)
    images_tensor = process_images(
        images,
        image_processor,
        model.config
    ).to(model.device, dtype=torch.float16)

    input_ids = (
        tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
        .unsqueeze(0)
        .cuda()
    )

    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=images_tensor,
            do_sample=args.temperature > 0,
            temperature=args.temperature,
            top_p=args.top_p,
            num_beams=args.num_beams,
            max_new_tokens=args.max_new_tokens,
            use_cache=True,
            stopping_criteria=[stopping_criteria],
        )

    # Sanity check: the returned sequence is expected to start with the prompt ids.
    input_token_len = input_ids.shape[1]
    n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
    if n_diff_input_output > 0:
        print(
            f"[Warning] {n_diff_input_output} output_ids are not the same as the input_ids"
        )
    outputs = tokenizer.batch_decode(
        output_ids[:, input_token_len:], skip_special_tokens=True
    )[0]
    outputs = outputs.strip()
    if outputs.endswith(stop_str):
        outputs = outputs[: -len(stop_str)]
    outputs = outputs.strip()
    print(outputs)
def parse_args():
    """Parse the command-line options of the review summarizer."""
    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
    parser.add_argument('-d', '--dir', default=None)
    parser.add_argument('-v', '--version', default=None)
    parser.add_argument('-s', '--select', nargs='*', default=None)
    parser.add_argument('-f', '--files', nargs='*', default=[])
    parser.add_argument('-i', '--ignore', nargs='*', default=[])
    return parser.parse_args()


def find_review_files(directory):
    """List the review ``.jsonl`` files in ``directory`` (current directory when None).

    A file qualifies when its name starts with ``gpt4_text``, ``reviews_`` or
    ``review_``, or when the directory path itself contains ``review``.
    """
    # With no --dir the substring test on the directory cannot apply; the
    # original evaluated `'review' in None` here and raised TypeError.
    dir_is_review = directory is not None and 'review' in directory
    search_dir = directory if directory is not None else '.'
    return [
        x for x in os.listdir(search_dir)
        if x.endswith('.jsonl')
        and (x.startswith(('gpt4_text', 'reviews_', 'review_')) or dir_is_review)
    ]


def summarize_scores(scores):
    """Return one ``(key, ratio, first_x10, second_x10)`` row per key, sorted by key.

    ``scores`` maps a key to a list of score pairs. The pairs are averaged
    column-wise and rounded to three decimals; ``ratio`` is the second mean
    as a percentage of the first (NaN when the first mean is zero).
    """
    rows = []
    for k, v in sorted(scores.items()):
        stats = np.asarray(v).mean(0).tolist()
        stats = [round(x, 3) for x in stats]
        ratio = round(stats[1] / stats[0] * 100, 1) if stats[0] else float('nan')
        rows.append((k, ratio, round(stats[0] * 10, 1), round(stats[1] * 10, 1)))
    return rows


if __name__ == '__main__':
    args = parse_args()

    if args.ignore is not None:
        args.ignore = [int(x) for x in args.ignore]

    if len(args.files) > 0:
        review_files = args.files
    else:
        review_files = find_review_files(args.dir)

    for review_file in sorted(review_files):
        config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '')
        if args.select is not None and any(x not in config for x in args.select):
            continue
        # The version tag is read from the file name; anything without '0613' counts as '0314'.
        if '0613' in config:
            version = '0613'
        else:
            version = '0314'
        if args.version is not None and args.version != version:
            continue
        scores = defaultdict(list)
        print(config)
        with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f:
            for review_str in f:
                review = json.loads(review_str)
                if review['question_id'] in args.ignore:
                    continue
                if 'category' in review:
                    scores[review['category']].append(review['tuple'])
                    scores['all'].append(review['tuple'])
                else:
                    if 'tuple' in review:
                        scores['all'].append(review['tuple'])
                    else:
                        scores['all'].append(review['score'])
        for row in summarize_scores(scores):
            print(*row)
        print('=================================')
@@ - \ No newline at end of file diff --git a/LLAVA_Biovil/llava/eval/webpage/figures/llama.jpg b/LLAVA_Biovil/llava/eval/webpage/figures/llama.jpg deleted file mode 100644 index 7217e5dc1bb683453204a20890f01f5806ce12cf..0000000000000000000000000000000000000000 Binary files a/LLAVA_Biovil/llava/eval/webpage/figures/llama.jpg and /dev/null differ diff --git a/LLAVA_Biovil/llava/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg b/LLAVA_Biovil/llava/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg deleted file mode 100644 index 3bee468d34515fdcbef1a8b8803c9fc4f7dc0b34..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/LLAVA_Biovil/llava/eval/webpage/figures/vicuna.jpeg b/LLAVA_Biovil/llava/eval/webpage/figures/vicuna.jpeg deleted file mode 100644 index e7883dc886b96d078883e01aefd16792133e204a..0000000000000000000000000000000000000000 Binary files a/LLAVA_Biovil/llava/eval/webpage/figures/vicuna.jpeg and /dev/null differ diff --git a/LLAVA_Biovil/llava/eval/webpage/index.html b/LLAVA_Biovil/llava/eval/webpage/index.html deleted file mode 100644 index c2e3cf020ba7d8e064f2cd801788a5d2d50b97da..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/eval/webpage/index.html +++ /dev/null @@ -1,162 +0,0 @@ - - - - - - Who's GPT-4's favorite? Battles between State-of-the-Art Chatbots - - - - - - - - -
-

Who's GPT-4's favorite? Battles between State-of-the-Art Chatbots

- - -
-
- - -
-
- - -
-
-
-
- - -
-
-
- - -
-
- -
-
-
- other logo -
-
-
-
- - -
-
-
-
- vicuna logo -
-
-
- -
-
- - -
-
-
- - -
-
-
-
-
-
- -
-
- -
-
-
-
-
-
-
-
-
-
-
-
-
-
- Assistant #2 (Vicuna, our model) -
-
-
-
-
-
-
-
-
-
- - -
-
GPT-4 Evaluation
-
-
-
-
-
-
-
-
- -
-
- This website is co-authored with GPT-4. -
-
// Description: Script for the evaluation webpage.

let currentQuestionIndex = 1;

// Store the model name mapping for later use.
const modelNameMapping = {
    "gpt35": "ChatGPT-3.5",
    "gpt4": "GPT-4",
    "alpaca": "Alpaca-13b",
    "vicuna": "Vicuna-13b",
    "llama": "LLaMA-13b",
    "bard": "Bard",
};

const modelFigureMapping = {
    "vicuna": "figures/vicuna.jpeg",
    // Image from: https://commons.wikimedia.org/wiki/File:ChatGPT_logo.svg
    "gpt35": "figures/chatgpt.svg",
    // Image from: https://www.reddit.com/r/logodesign/comments/1128aat/google_ai_bard_logo_design/
    "bard": "figures/bard.jpg",
    // Image from: https://crfm.stanford.edu/2023/03/13/alpaca.html
    "alpaca": "figures/alpaca.png",
    // Image adapted from https://commons.wikimedia.org/wiki/File:Llama_on_Machu_Picchu.jpg
    "llama": "figures/llama.jpg",
};

// Store the question data in a mapping for later use.
const questionMapping = {};
// Store the question ids in a mapping for later use.
const categoryMapping = {};
// Store the number of questions for later use.
let questionsCount = 0;


// Normalize paragraph breaks and render the text to HTML via `marked`.
function text2Markdown(text) {
    // Collapse double newlines first so every remaining newline becomes exactly one blank line.
    text = text.trim().replaceAll('\n\n', '\n').replaceAll('\n', '\n\n');
    return marked.parse(text);
}

// Return `str` with its first character upper-cased; empty/undefined input is returned as is.
function capitalizeFirstChar(str) {
    if (!str || str.length === 0) {
        return str;
    }
    return str.charAt(0).toUpperCase() + str.slice(1);
}

// Rebuild the question dropdown with all questions of the category that `question_id` belongs to.
function updateQuestionSelect(question_id) {
    const select = document.getElementById('question-select');
    // Clear the question select.
    select.innerHTML = '';
    // Populate the question select.
    const category = questionMapping[question_id].category;
    categoryMapping[category].forEach(id => {
        const question = questionMapping[id];
        const option = document.createElement('option');
        option.value = id;
        option.textContent = 'Q' + id.toString() + ': ' + question.question;
        select.appendChild(option);
    });
    select.value = question_id;
}

// Show the logo of the model currently chosen in the model dropdown.
function updateModelSelect() {
    const select = document.getElementById('model-select');
    const imgPath = modelFigureMapping[select.value];
    document.getElementById('other-model-figure').src = imgPath;
}

// Fill the model dropdown with one option per entry of `models`.
function populateModels(models) {
    const select = document.getElementById('model-select');
    models.forEach(model => {
        const option = document.createElement('option');
        option.value = model;
        option.textContent = modelNameMapping[model];
        select.appendChild(option);
    });
    updateModelSelect();
}

// Index `questions` by id and by category, and fill the category dropdown.
function populateQuestions(questions) {
    const category_select = document.getElementById('category-select');

    questionsCount = questions.length;
    questions.forEach(question => {
        // Store the question data in a mapping for later use.
        questionMapping[question.id] = {
            category: question.category,
            question: question.question,
            answers: question.answers,
            evaluations: question.evaluations,
            scores: question.scores,
        };
        // Store the question id in the category mapping.
        if (question.category in categoryMapping) {
            categoryMapping[question.category].push(question.id);
        } else {
            categoryMapping[question.category] = [question.id];
            const category_option = document.createElement('option');
            category_option.value = question.category;
            category_option.textContent = capitalizeFirstChar(question.category);
            category_select.appendChild(category_option);
        }
    });
    // Set the default category.
    updateQuestionSelect(currentQuestionIndex);
}

// Render the question text for `index` and then its answers.
function displayQuestion(index) {
    const question = questionMapping[index].question;
    document.getElementById('selected-question').innerHTML = text2Markdown('**Question:** ' + question);
    displayAnswers(index);
}

// Render both answers, the evaluation, the scores and the winner highlighting for question `index`.
function displayAnswers(index) {
    const question = questionMapping[index];
    const otherModel = document.getElementById('model-select').value;
    // render the answers with markdown
    document.getElementById('other-model-answer').innerHTML = text2Markdown(question.answers[otherModel]);
    document.getElementById('our-model-answer').innerHTML = text2Markdown(question.answers.vicuna);

    // Display evaluation
    const score = question.scores[otherModel];
    const score_text = modelNameMapping[otherModel] + " " + score[0] + "/10, Vicuna-13b " + score[1] + "/10";
    document.getElementById('evaluation-header').textContent = "GPT-4 Evaluation" + " (Score: " + score_text + ")";
    document.getElementById('evaluation-result').innerHTML = text2Markdown(question.evaluations[otherModel]);

    // Update model names
    let assistant1_title = "Assistant #1"; // (" + modelNameMapping[otherModel] + ")";
    let assistant2_title = "Assistant #2 (Vicuna-13b, our model)";
    // Update scores/labels.
    let assistant1_score_label = score[0].toString() + '/10';
    let assistant2_score_label = score[1].toString() + '/10';

    const colorRed = '#fa9'; // '#eb978d';
    const colorBlue = '#8ef'; // '#71dbf9';
    const colorYellow = '#fe7'; // '#fada57';
    let otherModelHeaderColor = '';
    let ourModelHeaderColor = '';
    // Update the winner: a tie marks both sides, otherwise only the higher score.
    if (score[0] == score[1]) {
        assistant1_title = '🏆 ' + assistant1_title;
        assistant1_score_label = '🏆 ' + assistant1_score_label;
        assistant2_title = '🏆 ' + assistant2_title;
        assistant2_score_label = '🏆 ' + assistant2_score_label;
        otherModelHeaderColor = colorYellow;
        ourModelHeaderColor = colorYellow;
    } else if (score[0] > score[1]) {
        assistant1_title = '🏆 ' + assistant1_title;
        assistant1_score_label = '🏆 ' + assistant1_score_label;
        otherModelHeaderColor = colorBlue;
        ourModelHeaderColor = colorRed;
    } else if (score[0] < score[1]) {
        assistant2_title = '🏆 ' + assistant2_title;
        assistant2_score_label = '🏆 ' + assistant2_score_label;
        otherModelHeaderColor = colorRed;
        ourModelHeaderColor = colorBlue;
    }

    document.getElementById('other-model-header-bg').style.backgroundColor = otherModelHeaderColor;
    document.getElementById('our-model-header').style.backgroundColor = ourModelHeaderColor;

    document.getElementById('other-model-header').textContent = assistant1_title;
    document.getElementById('our-model-header').textContent = assistant2_title;

    document.getElementById('other-score-label').textContent = assistant1_score_label;
    document.getElementById('our-score-label').textContent = assistant2_score_label;

    // Reset the expanded state and update expand buttons visibility for both cards after displaying answers
    document.querySelectorAll('.expandable-card').forEach(card => {
        card.classList.remove('expanded');
        updateExpandButtonVisibility(card);
        const expandBtn = card.querySelector('.expand-btn');
        expandBtn.innerHTML = 'keyboard_arrow_down Show more'; // .textContent = 'Show more';
    });
}

document.getElementById('question-select').addEventListener('change', e => {
    currentQuestionIndex = parseInt(e.target.value);
    displayQuestion(currentQuestionIndex);
});

document.getElementById('category-select').addEventListener('change', e => {
    const currentCategory = e.target.value;
    const questionIds = categoryMapping[currentCategory];
    currentQuestionIndex = questionIds[0];
    updateQuestionSelect(currentQuestionIndex);
    displayQuestion(currentQuestionIndex);
});

// Update expand buttons whenever the model is changed
document.getElementById('model-select').addEventListener('change', () => {
    displayAnswers(currentQuestionIndex);
    document.querySelectorAll('.expandable-card').forEach(card => {
        updateExpandButtonVisibility(card);
    });
    updateModelSelect();
});

// Sync both dropdowns with `currentQuestionIndex` (switching category if needed) and redraw.
function switchQuestionAndCategory() {
    document.getElementById('question-select').value = currentQuestionIndex;
    const old_category = document.getElementById('category-select').value;
    const new_category = questionMapping[currentQuestionIndex].category;
    if (old_category != new_category) {
        document.getElementById('category-select').value = new_category;
        updateQuestionSelect(currentQuestionIndex);
    }
    displayQuestion(currentQuestionIndex);
}

document.getElementById('prev-question').addEventListener('click', () => {
    // Question index starts from 1.
    currentQuestionIndex = Math.max(1, currentQuestionIndex - 1);
    switchQuestionAndCategory();
});

document.getElementById('next-question').addEventListener('click', () => {
    // Question index starts from 1.
    currentQuestionIndex = Math.min(questionsCount, currentQuestionIndex + 1);
    switchQuestionAndCategory();
});

// Show the expand button only when the card text overflows its container; otherwise mark the card expanded.
function updateExpandButtonVisibility(card) {
    const cardTextContainer = card.querySelector('.card-text-container');
    const expandBtn = card.querySelector('.expand-btn');
    if (cardTextContainer.scrollHeight > cardTextContainer.offsetHeight) {
        expandBtn.style.display = 'flex';
    } else {
        expandBtn.style.display = 'none';
        card.classList.add('expanded');
    }
}

document.querySelectorAll('.expand-btn').forEach(btn => {
    btn.addEventListener('click', e => {
        const card = e.target.closest('.expandable-card');
        card.classList.toggle('expanded');
        const more = 'keyboard_arrow_down Show more';
        const less = 'keyboard_arrow_up Show less';
        e.target.innerHTML = card.classList.contains('expanded') ? less : more;
    });
});
none; - background-color: rgba(255, 255, 255, 0.8); - color: #510c75; - border-color: transparent; -} - -.expand-btn:hover { - background-color: rgba(200, 200, 200, 0.8); - text-decoration: none; - border-color: transparent; - color: #510c75; -} - -.expand-btn:focus { - outline: none; - text-decoration: none; -} - -.expandable-card:not(.expanded) .card-text-container:after { - content: ""; - position: absolute; - bottom: 0; - left: 0; - width: 100%; - height: 90px; - background: linear-gradient(rgba(255, 255, 255, 0.2), rgba(255, 255, 255, 1)); -} - -.expandable-card:not(.expanded) .expand-btn { - margin-top: -40px; -} - -.card-body { - padding-bottom: 5px; -} - -.vertical-flex-layout { - justify-content: center; - align-items: center; - height: 100%; - display: flex; - flex-direction: column; - gap: 5px; -} - -.figure-img { - max-width: 100%; - height: auto; -} - -.adjustable-font-size { - font-size: calc(0.5rem + 2vw); -} diff --git a/LLAVA_Biovil/llava/mm_utils.py b/LLAVA_Biovil/llava/mm_utils.py index 8f373c7a181ea8bad406e323314d03d321581e74..0d5a7e00d08961880e5cd86d01a89dac3bdcd647 100644 --- a/LLAVA_Biovil/llava/mm_utils.py +++ b/LLAVA_Biovil/llava/mm_utils.py @@ -5,7 +5,7 @@ import torch from transformers import StoppingCriteria -from llava.constants import IMAGE_TOKEN_INDEX +from LLAVA_Biovil.llava.constants import IMAGE_TOKEN_INDEX def load_image_from_base64(image): diff --git a/LLAVA_Biovil/llava/model/apply_delta.py b/LLAVA_Biovil/llava/model/apply_delta.py index 666dd9691bde7d54ddf2871e311d6f621e29f099..c8c9a5a34d737e14c659f28ec822892294885c4f 100644 --- a/LLAVA_Biovil/llava/model/apply_delta.py +++ b/LLAVA_Biovil/llava/model/apply_delta.py @@ -7,7 +7,7 @@ import torch from tqdm import tqdm from transformers import AutoTokenizer, AutoModelForCausalLM -from llava import LlavaLlamaForCausalLM +from LLAVA_Biovil.llava import LlavaLlamaForCausalLM def apply_delta(base_model_path, target_model_path, delta_path): diff --git 
a/LLAVA_Biovil/llava/model/builder.py b/LLAVA_Biovil/llava/model/builder.py index 2750f6297b2b84d6deb7767a64d1021be815c5a1..632c944c3e92d1ed7585bc22a8a7a5ba06a96be5 100644 --- a/LLAVA_Biovil/llava/model/builder.py +++ b/LLAVA_Biovil/llava/model/builder.py @@ -20,17 +20,17 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig import torch -from LLAVA.biovil_t.model import ImageModel -from LLAVA.biovil_t.pretrained import _download_biovil_t_image_model_weights -from LLAVA.biovil_t.types import ImageEncoderType -from LLAVA.llava.model.multimodal_projector.builder import build_vision_projector +from LLAVA_Biovil.biovil_t.model import ImageModel +from LLAVA_Biovil.biovil_t.pretrained import _download_biovil_t_image_model_weights +from LLAVA_Biovil.biovil_t.types import ImageEncoderType +from LLAVA_Biovil.llava.model.multimodal_projector.builder import build_vision_projector try: - from LLAVA.llava.model import * - from LLAVA.llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN + from LLAVA_Biovil.llava.model import * + from LLAVA_Biovil.llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN except: - from llava.model import * - from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN + from LLAVA_Biovil.llava.model import * + from LLAVA_Biovil.llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", **kwargs): diff --git a/LLAVA_Biovil/llava/model/consolidate.py b/LLAVA_Biovil/llava/model/consolidate.py index 18894ba4403ff4ce188d26cf6f8d2322c3a0af0f..a07b085a0a83391bf2f62f51142c95fc190b6f62 100644 --- a/LLAVA_Biovil/llava/model/consolidate.py +++ b/LLAVA_Biovil/llava/model/consolidate.py @@ -6,7 +6,7 @@ import torch from transformers import 
AutoTokenizer, AutoModelForCausalLM -from LLAV.llava.model.utils import auto_upgrade +from LLAVA_Biovil.llava.model.utils import auto_upgrade def consolidate_ckpt(src_path, dst_path): diff --git a/LLAVA_Biovil/llava/model/language_model/llava_llama.py b/LLAVA_Biovil/llava/model/language_model/llava_llama.py index 0c4d323f07be6ce5e8f12d9ee1944e339a51f3ac..c0daaa44c147529378c9f0c7f8474ce9e82456b1 100644 --- a/LLAVA_Biovil/llava/model/language_model/llava_llama.py +++ b/LLAVA_Biovil/llava/model/language_model/llava_llama.py @@ -25,7 +25,7 @@ from transformers.modeling_outputs import CausalLMOutputWithPast -from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM +from LLAVA_Biovil.llava.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM class LlavaConfig(LlamaConfig): diff --git a/LLAVA_Biovil/llava/model/language_model/llava_mpt.py b/LLAVA_Biovil/llava/model/language_model/llava_mpt.py index 60c920bcc37fa9121fe4cf2a7825f85d1095177c..2a68bce3b047d06e83bf0621da08d10e0aecb074 100644 --- a/LLAVA_Biovil/llava/model/language_model/llava_mpt.py +++ b/LLAVA_Biovil/llava/model/language_model/llava_mpt.py @@ -23,8 +23,8 @@ from transformers import AutoConfig, AutoModelForCausalLM from transformers.modeling_outputs import CausalLMOutputWithPast -from .mpt.modeling_mpt import MPTConfig, MPTForCausalLM, MPTModel -from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM +from LLAVA_Biovil.llava.model.language_model.mpt.modeling_mpt import MPTConfig, MPTForCausalLM, MPTModel +from LLAVA_Biovil.llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM class LlavaMPTConfig(MPTConfig): diff --git a/LLAVA_Biovil/llava/model/llava_arch.py b/LLAVA_Biovil/llava/model/llava_arch.py index 6c7c394b1665a7240995d26aac82f756b18a5e02..ea54d53dca45415918b47b2f1be279b5eccc3b92 100644 --- a/LLAVA_Biovil/llava/model/llava_arch.py +++ b/LLAVA_Biovil/llava/model/llava_arch.py @@ -15,13 +15,13 @@ import torch -from biovil_t.model import ImageModel -from 
biovil_t.pretrained import _download_biovil_t_image_model_weights -from biovil_t.types import ImageEncoderType -from .multimodal_encoder.builder import build_vision_tower -from .multimodal_projector.builder import build_vision_projector, build_image_pooler +from LLAVA_Biovil.biovil_t.model import ImageModel +from LLAVA_Biovil.biovil_t.pretrained import _download_biovil_t_image_model_weights +from LLAVA_Biovil.biovil_t.types import ImageEncoderType +from LLAVA_Biovil.llava.multimodal_encoder.builder import build_vision_tower +from LLAVA_Biovil.llava.multimodal_projector.builder import build_vision_projector, build_image_pooler -from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +from LLAVA_Biovil.llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN diff --git a/LLAVA_Biovil/llava/serve/__init__.py b/LLAVA_Biovil/llava/serve/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/LLAVA_Biovil/llava/serve/cli.py b/LLAVA_Biovil/llava/serve/cli.py deleted file mode 100644 index c3b1fa5b8bd35aaccac7902763cb3a16f6dbab8f..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/serve/cli.py +++ /dev/null @@ -1,122 +0,0 @@ -import argparse -import torch - -from LLAV.llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN -from LLAV.llava.conversation import conv_templates, SeparatorStyle -from LLAV.llava.model.builder import load_pretrained_model -from LLAV.llava.utils import disable_torch_init -from LLAV.llava.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria - -import requests -from PIL import Image -from io import BytesIO -from transformers import TextStreamer - - -def load_image(image_file): - if 
image_file.startswith('http://') or image_file.startswith('https://'): - response = requests.get(image_file) - image = Image.open(BytesIO(response.content)).convert('RGB') - else: - image = Image.open(image_file).convert('RGB') - return image - - -def main(args): - # Model - disable_torch_init() - - model_name = get_model_name_from_path(args.model_path) - tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit, device=args.device) - - if 'llama-2' in model_name.lower(): - conv_mode = "llava_llama_2" - elif "v1" in model_name.lower(): - conv_mode = "llava_v1" - elif "mpt" in model_name.lower(): - conv_mode = "mpt" - else: - conv_mode = "llava_v0" - - if args.conv_mode is not None and conv_mode != args.conv_mode: - print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode)) - else: - args.conv_mode = conv_mode - - conv = conv_templates[args.conv_mode].copy() - if "mpt" in model_name.lower(): - roles = ('user', 'assistant') - else: - roles = conv.roles - - image = load_image(args.image_file) - # Similar operation in model_worker.py - image_tensor = process_images([image], image_processor, model.config) - if type(image_tensor) is list: - image_tensor = [image.to(model.device, dtype=torch.float16) for image in image_tensor] - else: - image_tensor = image_tensor.to(model.device, dtype=torch.float16) - - while True: - try: - inp = input(f"{roles[0]}: ") - except EOFError: - inp = "" - if not inp: - print("exit...") - break - - print(f"{roles[1]}: ", end="") - - if image is not None: - # first message - if model.config.mm_use_im_start_end: - inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp - else: - inp = DEFAULT_IMAGE_TOKEN + '\n' + inp - conv.append_message(conv.roles[0], inp) - image = None - else: - # later messages - conv.append_message(conv.roles[0], 
inp) - conv.append_message(conv.roles[1], None) - prompt = conv.get_prompt() - - input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device) - stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 - keywords = [stop_str] - stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) - streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) - - with torch.inference_mode(): - output_ids = model.generate( - input_ids, - images=image_tensor, - do_sample=True if args.temperature > 0 else False, - temperature=args.temperature, - max_new_tokens=args.max_new_tokens, - streamer=streamer, - use_cache=True, - stopping_criteria=[stopping_criteria]) - - outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip() - conv.messages[-1][-1] = outputs - - if args.debug: - print("\n", {"prompt": prompt, "outputs": outputs}, "\n") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--model-path", type=str, default="facebook/opt-350m") - parser.add_argument("--model-base", type=str, default=None) - parser.add_argument("--image-file", type=str, required=True) - parser.add_argument("--device", type=str, default="cuda") - parser.add_argument("--conv-mode", type=str, default=None) - parser.add_argument("--temperature", type=float, default=0.2) - parser.add_argument("--max-new-tokens", type=int, default=512) - parser.add_argument("--load-8bit", action="store_true") - parser.add_argument("--load-4bit", action="store_true") - parser.add_argument("--debug", action="store_true") - args = parser.parse_args() - main(args) diff --git a/LLAVA_Biovil/llava/serve/controller.py b/LLAVA_Biovil/llava/serve/controller.py deleted file mode 100644 index 56beb82705aa669e2411eed2c2bdeff520fe1392..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/serve/controller.py +++ /dev/null @@ -1,296 +0,0 @@ -""" -A controller manages 
distributed workers. -It sends worker addresses to clients. -""" -import argparse -import dataclasses -from enum import Enum, auto -import json -import time -from typing import List -import threading - -from fastapi import FastAPI, Request -from fastapi.responses import StreamingResponse -import numpy as np -import requests -import uvicorn - -from llava.constants import CONTROLLER_HEART_BEAT_EXPIRATION -from llava.utils import build_logger, server_error_msg - - -logger = build_logger("controller", "controller.log") - - -class DispatchMethod(Enum): - LOTTERY = auto() - SHORTEST_QUEUE = auto() - - @classmethod - def from_str(cls, name): - if name == "lottery": - return cls.LOTTERY - elif name == "shortest_queue": - return cls.SHORTEST_QUEUE - else: - raise ValueError(f"Invalid dispatch method") - - -@dataclasses.dataclass -class WorkerInfo: - model_names: List[str] - speed: int - queue_length: int - check_heart_beat: bool - last_heart_beat: str - - -def heart_beat_controller(controller): - while True: - time.sleep(CONTROLLER_HEART_BEAT_EXPIRATION) - controller.remove_stable_workers_by_expiration() - - -class Controller: - def __init__(self, dispatch_method: str): - # Dict[str -> WorkerInfo] - self.worker_info = {} - self.dispatch_method = DispatchMethod.from_str(dispatch_method) - - self.heart_beat_thread = threading.Thread( - target=heart_beat_controller, args=(self,)) - self.heart_beat_thread.start() - - logger.info("Init controller") - - def register_worker(self, worker_name: str, check_heart_beat: bool, - worker_status: dict): - if worker_name not in self.worker_info: - logger.info(f"Register a new worker: {worker_name}") - else: - logger.info(f"Register an existing worker: {worker_name}") - - if not worker_status: - worker_status = self.get_worker_status(worker_name) - if not worker_status: - return False - - self.worker_info[worker_name] = WorkerInfo( - worker_status["model_names"], worker_status["speed"], worker_status["queue_length"], - check_heart_beat, 
time.time()) - - logger.info(f"Register done: {worker_name}, {worker_status}") - return True - - def get_worker_status(self, worker_name: str): - try: - r = requests.post(worker_name + "/worker_get_status", timeout=5) - except requests.exceptions.RequestException as e: - logger.error(f"Get status fails: {worker_name}, {e}") - return None - - if r.status_code != 200: - logger.error(f"Get status fails: {worker_name}, {r}") - return None - - return r.json() - - def remove_worker(self, worker_name: str): - del self.worker_info[worker_name] - - def refresh_all_workers(self): - old_info = dict(self.worker_info) - self.worker_info = {} - - for w_name, w_info in old_info.items(): - if not self.register_worker(w_name, w_info.check_heart_beat, None): - logger.info(f"Remove stale worker: {w_name}") - - def list_models(self): - model_names = set() - - for w_name, w_info in self.worker_info.items(): - model_names.update(w_info.model_names) - - return list(model_names) - - def get_worker_address(self, model_name: str): - if self.dispatch_method == DispatchMethod.LOTTERY: - worker_names = [] - worker_speeds = [] - for w_name, w_info in self.worker_info.items(): - if model_name in w_info.model_names: - worker_names.append(w_name) - worker_speeds.append(w_info.speed) - worker_speeds = np.array(worker_speeds, dtype=np.float32) - norm = np.sum(worker_speeds) - if norm < 1e-4: - return "" - worker_speeds = worker_speeds / norm - if True: # Directly return address - pt = np.random.choice(np.arange(len(worker_names)), - p=worker_speeds) - worker_name = worker_names[pt] - return worker_name - - # Check status before returning - while True: - pt = np.random.choice(np.arange(len(worker_names)), - p=worker_speeds) - worker_name = worker_names[pt] - - if self.get_worker_status(worker_name): - break - else: - self.remove_worker(worker_name) - worker_speeds[pt] = 0 - norm = np.sum(worker_speeds) - if norm < 1e-4: - return "" - worker_speeds = worker_speeds / norm - continue - return 
worker_name - elif self.dispatch_method == DispatchMethod.SHORTEST_QUEUE: - worker_names = [] - worker_qlen = [] - for w_name, w_info in self.worker_info.items(): - if model_name in w_info.model_names: - worker_names.append(w_name) - worker_qlen.append(w_info.queue_length / w_info.speed) - if len(worker_names) == 0: - return "" - min_index = np.argmin(worker_qlen) - w_name = worker_names[min_index] - self.worker_info[w_name].queue_length += 1 - logger.info(f"names: {worker_names}, queue_lens: {worker_qlen}, ret: {w_name}") - return w_name - else: - raise ValueError(f"Invalid dispatch method: {self.dispatch_method}") - - def receive_heart_beat(self, worker_name: str, queue_length: int): - if worker_name not in self.worker_info: - logger.info(f"Receive unknown heart beat. {worker_name}") - return False - - self.worker_info[worker_name].queue_length = queue_length - self.worker_info[worker_name].last_heart_beat = time.time() - logger.info(f"Receive heart beat. {worker_name}") - return True - - def remove_stable_workers_by_expiration(self): - expire = time.time() - CONTROLLER_HEART_BEAT_EXPIRATION - to_delete = [] - for worker_name, w_info in self.worker_info.items(): - if w_info.check_heart_beat and w_info.last_heart_beat < expire: - to_delete.append(worker_name) - - for worker_name in to_delete: - self.remove_worker(worker_name) - - def worker_api_generate_stream(self, params): - worker_addr = self.get_worker_address(params["model"]) - if not worker_addr: - logger.info(f"no worker: {params['model']}") - ret = { - "text": server_error_msg, - "error_code": 2, - } - yield json.dumps(ret).encode() + b"\0" - - try: - response = requests.post(worker_addr + "/worker_generate_stream", - json=params, stream=True, timeout=5) - for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"): - if chunk: - yield chunk + b"\0" - except requests.exceptions.RequestException as e: - logger.info(f"worker timeout: {worker_addr}") - ret = { - "text": server_error_msg, - 
"error_code": 3, - } - yield json.dumps(ret).encode() + b"\0" - - - # Let the controller act as a worker to achieve hierarchical - # management. This can be used to connect isolated sub networks. - def worker_api_get_status(self): - model_names = set() - speed = 0 - queue_length = 0 - - for w_name in self.worker_info: - worker_status = self.get_worker_status(w_name) - if worker_status is not None: - model_names.update(worker_status["model_names"]) - speed += worker_status["speed"] - queue_length += worker_status["queue_length"] - - return { - "model_names": list(model_names), - "speed": speed, - "queue_length": queue_length, - } - - -app = FastAPI() - - -@app.post("/register_worker") -async def register_worker(request: Request): - data = await request.json() - controller.register_worker( - data["worker_name"], data["check_heart_beat"], - data.get("worker_status", None)) - - -@app.post("/refresh_all_workers") -async def refresh_all_workers(): - models = controller.refresh_all_workers() - - -@app.post("/list_models") -async def list_models(): - models = controller.list_models() - return {"models": models} - - -@app.post("/get_worker_address") -async def get_worker_address(request: Request): - data = await request.json() - addr = controller.get_worker_address(data["model"]) - return {"address": addr} - - -@app.post("/receive_heart_beat") -async def receive_heart_beat(request: Request): - data = await request.json() - exist = controller.receive_heart_beat( - data["worker_name"], data["queue_length"]) - return {"exist": exist} - - -@app.post("/worker_generate_stream") -async def worker_api_generate_stream(request: Request): - params = await request.json() - generator = controller.worker_api_generate_stream(params) - return StreamingResponse(generator) - - -@app.post("/worker_get_status") -async def worker_api_get_status(request: Request): - return controller.worker_api_get_status() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - 
parser.add_argument("--host", type=str, default="localhost") - parser.add_argument("--port", type=int, default=21001) - parser.add_argument("--dispatch-method", type=str, choices=[ - "lottery", "shortest_queue"], default="shortest_queue") - args = parser.parse_args() - logger.info(f"args: {args}") - - controller = Controller(args.dispatch_method) - uvicorn.run(app, host=args.host, port=args.port, log_level="info") diff --git a/LLAVA_Biovil/llava/serve/examples/extreme_ironing.jpg b/LLAVA_Biovil/llava/serve/examples/extreme_ironing.jpg deleted file mode 100644 index 638b078837f175039b2db49a63821288d9681daa..0000000000000000000000000000000000000000 Binary files a/LLAVA_Biovil/llava/serve/examples/extreme_ironing.jpg and /dev/null differ diff --git a/LLAVA_Biovil/llava/serve/examples/waterview.jpg b/LLAVA_Biovil/llava/serve/examples/waterview.jpg deleted file mode 100644 index 6f44ebaba1aa493b8bab3baa4e827b76752b1869..0000000000000000000000000000000000000000 Binary files a/LLAVA_Biovil/llava/serve/examples/waterview.jpg and /dev/null differ diff --git a/LLAVA_Biovil/llava/serve/gradio_web_server.py b/LLAVA_Biovil/llava/serve/gradio_web_server.py deleted file mode 100644 index 66661861690e0313c32de8b5b5175c5c807d13fe..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/serve/gradio_web_server.py +++ /dev/null @@ -1,470 +0,0 @@ -import argparse -import datetime -import json -import os -import time - -import gradio as gr -import requests - -from llava.conversation import (default_conversation, conv_templates, - SeparatorStyle) -from llava.constants import LOGDIR -from llava.utils import (build_logger, server_error_msg, - violates_moderation, moderation_msg) -import hashlib - - -logger = build_logger("gradio_web_server", "gradio_web_server.log") - -headers = {"User-Agent": "LLaVA Client"} - -no_change_btn = gr.Button.update() -enable_btn = gr.Button.update(interactive=True) -disable_btn = gr.Button.update(interactive=False) - -priority = { - "vicuna-13b": 
"aaaaaaa", - "koala-13b": "aaaaaab", -} - - -def get_conv_log_filename(): - t = datetime.datetime.now() - name = os.path.join(LOGDIR, f"{t.year}-{t.month:02d}-{t.day:02d}-conv.json") - return name - - -def get_model_list(): - ret = requests.post(args.controller_url + "/refresh_all_workers") - assert ret.status_code == 200 - ret = requests.post(args.controller_url + "/list_models") - models = ret.json()["models"] - models.sort(key=lambda x: priority.get(x, x)) - logger.info(f"Models: {models}") - return models - - -get_window_url_params = """ -function() { - const params = new URLSearchParams(window.location.search); - url_params = Object.fromEntries(params); - console.log(url_params); - return url_params; - } -""" - - -def load_demo(url_params, request: gr.Request): - logger.info(f"load_demo. ip: {request.client.host}. params: {url_params}") - - dropdown_update = gr.Dropdown.update(visible=True) - if "model" in url_params: - model = url_params["model"] - if model in models: - dropdown_update = gr.Dropdown.update( - value=model, visible=True) - - state = default_conversation.copy() - return state, dropdown_update - - -def load_demo_refresh_model_list(request: gr.Request): - logger.info(f"load_demo. ip: {request.client.host}") - models = get_model_list() - state = default_conversation.copy() - dropdown_update = gr.Dropdown.update( - choices=models, - value=models[0] if len(models) > 0 else "" - ) - return state, dropdown_update - - -def vote_last_response(state, vote_type, model_selector, request: gr.Request): - with open(get_conv_log_filename(), "a") as fout: - data = { - "tstamp": round(time.time(), 4), - "type": vote_type, - "model": model_selector, - "state": state.dict(), - "ip": request.client.host, - } - fout.write(json.dumps(data) + "\n") - - -def upvote_last_response(state, model_selector, request: gr.Request): - logger.info(f"upvote. 
ip: {request.client.host}") - vote_last_response(state, "upvote", model_selector, request) - return ("",) + (disable_btn,) * 3 - - -def downvote_last_response(state, model_selector, request: gr.Request): - logger.info(f"downvote. ip: {request.client.host}") - vote_last_response(state, "downvote", model_selector, request) - return ("",) + (disable_btn,) * 3 - - -def flag_last_response(state, model_selector, request: gr.Request): - logger.info(f"flag. ip: {request.client.host}") - vote_last_response(state, "flag", model_selector, request) - return ("",) + (disable_btn,) * 3 - - -def regenerate(state, image_process_mode, request: gr.Request): - logger.info(f"regenerate. ip: {request.client.host}") - state.messages[-1][-1] = None - prev_human_msg = state.messages[-2] - if type(prev_human_msg[1]) in (tuple, list): - prev_human_msg[1] = (*prev_human_msg[1][:2], image_process_mode) - state.skip_next = False - return (state, state.to_gradio_chatbot(), "", None) + (disable_btn,) * 5 - - -def clear_history(request: gr.Request): - logger.info(f"clear_history. ip: {request.client.host}") - state = default_conversation.copy() - return (state, state.to_gradio_chatbot(), "", None) + (disable_btn,) * 5 - - -def add_text(state, text, image, image_process_mode, request: gr.Request): - logger.info(f"add_text. ip: {request.client.host}. 
len: {len(text)}") - if len(text) <= 0 and image is None: - state.skip_next = True - return (state, state.to_gradio_chatbot(), "", None) + (no_change_btn,) * 5 - if args.moderate: - flagged = violates_moderation(text) - if flagged: - state.skip_next = True - return (state, state.to_gradio_chatbot(), moderation_msg, None) + ( - no_change_btn,) * 5 - - text = text[:1536] # Hard cut-off - if image is not None: - text = text[:1200] # Hard cut-off for images - if '' not in text: - # text = '' + text - text = text + '\n' - text = (text, image, image_process_mode) - if len(state.get_images(return_pil=True)) > 0: - state = default_conversation.copy() - state.append_message(state.roles[0], text) - state.append_message(state.roles[1], None) - state.skip_next = False - return (state, state.to_gradio_chatbot(), "", None) + (disable_btn,) * 5 - - -def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request: gr.Request): - logger.info(f"http_bot. ip: {request.client.host}") - start_tstamp = time.time() - model_name = model_selector - - if state.skip_next: - # This generate call is skipped due to invalid inputs - yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5 - return - - if len(state.messages) == state.offset + 2: - # First round of conversation - if "llava" in model_name.lower(): - if 'llama-2' in model_name.lower(): - template_name = "llava_llama_2" - elif "v1" in model_name.lower(): - if 'mmtag' in model_name.lower(): - template_name = "v1_mmtag" - elif 'plain' in model_name.lower() and 'finetune' not in model_name.lower(): - template_name = "v1_mmtag" - else: - template_name = "llava_v1" - elif "mpt" in model_name.lower(): - template_name = "mpt" - else: - if 'mmtag' in model_name.lower(): - template_name = "v0_mmtag" - elif 'plain' in model_name.lower() and 'finetune' not in model_name.lower(): - template_name = "v0_mmtag" - else: - template_name = "llava_v0" - elif "mpt" in model_name: - template_name = "mpt_text" - elif "llama-2" in 
model_name: - template_name = "llama_2" - else: - template_name = "vicuna_v1" - new_state = conv_templates[template_name].copy() - new_state.append_message(new_state.roles[0], state.messages[-2][1]) - new_state.append_message(new_state.roles[1], None) - state = new_state - - # Query worker address - controller_url = args.controller_url - ret = requests.post(controller_url + "/get_worker_address", - json={"model": model_name}) - worker_addr = ret.json()["address"] - logger.info(f"model_name: {model_name}, worker_addr: {worker_addr}") - - # No available worker - if worker_addr == "": - state.messages[-1][-1] = server_error_msg - yield (state, state.to_gradio_chatbot(), disable_btn, disable_btn, disable_btn, enable_btn, enable_btn) - return - - # Construct prompt - prompt = state.get_prompt() - - all_images = state.get_images(return_pil=True) - all_image_hash = [hashlib.md5(image.tobytes()).hexdigest() for image in all_images] - for image, hash in zip(all_images, all_image_hash): - t = datetime.datetime.now() - filename = os.path.join(LOGDIR, "serve_images", f"{t.year}-{t.month:02d}-{t.day:02d}", f"{hash}.jpg") - if not os.path.isfile(filename): - os.makedirs(os.path.dirname(filename), exist_ok=True) - image.save(filename) - - # Make requests - pload = { - "model": model_name, - "prompt": prompt, - "temperature": float(temperature), - "top_p": float(top_p), - "max_new_tokens": min(int(max_new_tokens), 1536), - "stop": state.sep if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT] else state.sep2, - "images": f'List of {len(state.get_images())} images: {all_image_hash}', - } - logger.info(f"==== request ====\n{pload}") - - pload['images'] = state.get_images() - - state.messages[-1][-1] = "▌" - yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5 - - try: - # Stream output - response = requests.post(worker_addr + "/worker_generate_stream", - headers=headers, json=pload, stream=True, timeout=10) - for chunk in 
response.iter_lines(decode_unicode=False, delimiter=b"\0"): - if chunk: - data = json.loads(chunk.decode()) - if data["error_code"] == 0: - output = data["text"][len(prompt):].strip() - state.messages[-1][-1] = output + "▌" - yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5 - else: - output = data["text"] + f" (error_code: {data['error_code']})" - state.messages[-1][-1] = output - yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn) - return - time.sleep(0.03) - except requests.exceptions.RequestException as e: - state.messages[-1][-1] = server_error_msg - yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn) - return - - state.messages[-1][-1] = state.messages[-1][-1][:-1] - yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5 - - finish_tstamp = time.time() - logger.info(f"{output}") - - with open(get_conv_log_filename(), "a") as fout: - data = { - "tstamp": round(finish_tstamp, 4), - "type": "chat", - "model": model_name, - "start": round(start_tstamp, 4), - "finish": round(finish_tstamp, 4), - "state": state.dict(), - "images": all_image_hash, - "ip": request.client.host, - } - fout.write(json.dumps(data) + "\n") - -title_markdown = (""" -# 🌋 LLaVA: Large Language and Vision Assistant -[[Project Page](https://llava-vl.github.io)] [[Code](https://github.com/haotian-liu/LLaVA)] [[Model](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)] | 📚 [[LLaVA](https://arxiv.org/abs/2304.08485)] [[LLaVA-v1.5](https://arxiv.org/abs/2310.03744)] -""") - -tos_markdown = (""" -### Terms of use -By using this service, users are required to agree to the following terms: -The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. 
The service may collect user dialogue data for future research. -Please click the "Flag" button if you get any inappropriate answer! We will collect those to keep improving our moderator. -For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality. -""") - - -learn_more_markdown = (""" -### License -The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation. -""") - -block_css = """ - -#buttons button { - min-width: min(120px,100%); -} - -""" - -def build_demo(embed_mode): - textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False) - with gr.Blocks(title="LLaVA", theme=gr.themes.Default(), css=block_css) as demo: - state = gr.State() - - if not embed_mode: - gr.Markdown(title_markdown) - - with gr.Row(): - with gr.Column(scale=3): - with gr.Row(elem_id="model_selector_row"): - model_selector = gr.Dropdown( - choices=models, - value=models[0] if len(models) > 0 else "", - interactive=True, - show_label=False, - container=False) - - imagebox = gr.Image(type="pil") - image_process_mode = gr.Radio( - ["Crop", "Resize", "Pad", "Default"], - value="Default", - label="Preprocess for non-square image", visible=False) - - cur_dir = os.path.dirname(os.path.abspath(__file__)) - gr.Examples(examples=[ - [f"{cur_dir}/examples/extreme_ironing.jpg", "What is unusual about this image?"], - [f"{cur_dir}/examples/waterview.jpg", "What are the things I should be cautious about when I visit here?"], - ], inputs=[imagebox, textbox]) - - with gr.Accordion("Parameters", open=False) as 
parameter_row: - temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.2, step=0.1, interactive=True, label="Temperature",) - top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, interactive=True, label="Top P",) - max_output_tokens = gr.Slider(minimum=0, maximum=1024, value=512, step=64, interactive=True, label="Max output tokens",) - - with gr.Column(scale=8): - chatbot = gr.Chatbot(elem_id="chatbot", label="LLaVA Chatbot", height=550) - with gr.Row(): - with gr.Column(scale=8): - textbox.render() - with gr.Column(scale=1, min_width=50): - submit_btn = gr.Button(value="Send", variant="primary") - with gr.Row(elem_id="buttons") as button_row: - upvote_btn = gr.Button(value="👍 Upvote", interactive=False) - downvote_btn = gr.Button(value="👎 Downvote", interactive=False) - flag_btn = gr.Button(value="⚠️ Flag", interactive=False) - #stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False) - regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False) - clear_btn = gr.Button(value="🗑️ Clear", interactive=False) - - if not embed_mode: - gr.Markdown(tos_markdown) - gr.Markdown(learn_more_markdown) - url_params = gr.JSON(visible=False) - - # Register listeners - btn_list = [upvote_btn, downvote_btn, flag_btn, regenerate_btn, clear_btn] - upvote_btn.click( - upvote_last_response, - [state, model_selector], - [textbox, upvote_btn, downvote_btn, flag_btn], - queue=False - ) - downvote_btn.click( - downvote_last_response, - [state, model_selector], - [textbox, upvote_btn, downvote_btn, flag_btn], - queue=False - ) - flag_btn.click( - flag_last_response, - [state, model_selector], - [textbox, upvote_btn, downvote_btn, flag_btn], - queue=False - ) - - regenerate_btn.click( - regenerate, - [state, image_process_mode], - [state, chatbot, textbox, imagebox] + btn_list, - queue=False - ).then( - http_bot, - [state, model_selector, temperature, top_p, max_output_tokens], - [state, chatbot] + btn_list - ) - - clear_btn.click( - clear_history, - None, 
- [state, chatbot, textbox, imagebox] + btn_list, - queue=False - ) - - textbox.submit( - add_text, - [state, textbox, imagebox, image_process_mode], - [state, chatbot, textbox, imagebox] + btn_list, - queue=False - ).then( - http_bot, - [state, model_selector, temperature, top_p, max_output_tokens], - [state, chatbot] + btn_list - ) - - submit_btn.click( - add_text, - [state, textbox, imagebox, image_process_mode], - [state, chatbot, textbox, imagebox] + btn_list, - queue=False - ).then( - http_bot, - [state, model_selector, temperature, top_p, max_output_tokens], - [state, chatbot] + btn_list - ) - - if args.model_list_mode == "once": - demo.load( - load_demo, - [url_params], - [state, model_selector], - _js=get_window_url_params, - queue=False - ) - elif args.model_list_mode == "reload": - demo.load( - load_demo_refresh_model_list, - None, - [state, model_selector], - queue=False - ) - else: - raise ValueError(f"Unknown model list mode: {args.model_list_mode}") - - return demo - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--host", type=str, default="0.0.0.0") - parser.add_argument("--port", type=int) - parser.add_argument("--controller-url", type=str, default="http://localhost:21001") - parser.add_argument("--concurrency-count", type=int, default=10) - parser.add_argument("--model-list-mode", type=str, default="once", - choices=["once", "reload"]) - parser.add_argument("--share", action="store_true") - parser.add_argument("--moderate", action="store_true") - parser.add_argument("--embed", action="store_true") - args = parser.parse_args() - logger.info(f"args: {args}") - - models = get_model_list() - - logger.info(args) - demo = build_demo(args.embed) - demo.queue( - concurrency_count=args.concurrency_count, - api_open=False - ).launch( - server_name=args.host, - server_port=args.port, - share=args.share - ) diff --git a/LLAVA_Biovil/llava/serve/model_worker.py b/LLAVA_Biovil/llava/serve/model_worker.py deleted file 
mode 100644 index 7633defc5646a9139eee0ed0472370ed5e405970..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/serve/model_worker.py +++ /dev/null @@ -1,310 +0,0 @@ -""" -A model worker executes the model. -""" -import argparse -import asyncio -import json -import time -import threading -import uuid - -from fastapi import FastAPI, Request, BackgroundTasks -from fastapi.responses import StreamingResponse -import requests -import torch -import uvicorn -from functools import partial - -from llava.constants import WORKER_HEART_BEAT_INTERVAL -from llava.utils import (build_logger, server_error_msg, - pretty_print_semaphore) -from llava.model.builder import load_pretrained_model -from llava.mm_utils import process_images, load_image_from_base64, tokenizer_image_token, KeywordsStoppingCriteria, process_image_biovil, \ - load_image_from_base64_biovil -from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN -from transformers import TextIteratorStreamer -from threading import Thread - -from torchvision.transforms import Compose, Resize, ToTensor, CenterCrop, transforms - -from test import ExpandChannels - -GB = 1 << 30 - -worker_id = str(uuid.uuid4())[:6] -logger = build_logger("model_worker", f"model_worker_{worker_id}.log") -global_counter = 0 - -model_semaphore = None - - -def heart_beat_worker(controller): - - while True: - time.sleep(WORKER_HEART_BEAT_INTERVAL) - controller.send_heart_beat() - - -class ModelWorker: - def __init__(self, controller_addr, worker_addr, - worker_id, no_register, - model_path, model_base, model_name, - load_8bit, load_4bit, device, vision_tower): - self.controller_addr = controller_addr - self.worker_addr = worker_addr - self.worker_id = worker_id - if model_path.endswith("/"): - model_path = model_path[:-1] - if model_name is None: - model_paths = model_path.split("/") - if model_paths[-1].startswith('checkpoint-'): - self.model_name = model_paths[-2] + "_" + 
model_paths[-1] - else: - self.model_name = model_paths[-1] - else: - self.model_name = model_name - - self.device = device - logger.info(f"Loading the model {self.model_name} on worker {worker_id} ...") - self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model( - model_path, model_base, self.model_name, load_8bit, load_4bit, device=self.device) - self.is_multimodal = 'llava' in self.model_name.lower() - - if not no_register: - self.register_to_controller() - self.heart_beat_thread = threading.Thread( - target=heart_beat_worker, args=(self,)) - self.heart_beat_thread.start() - - self.vision_tower = vision_tower - self.vis_transforms_biovil = self.create_chest_xray_transform_for_inference(512, center_crop_size=448) - - def create_chest_xray_transform_for_inference(self, resize: int, center_crop_size: int) -> Compose: - """ - Defines the image transformation pipeline for Chest-Xray datasets. - - :param resize: The size to resize the image to. Linear resampling is used. - Resizing is applied on the axis with smaller shape. - :param center_crop_size: The size to center crop the image to. Square crop is applied. - """ - - transforms = [Resize(resize), CenterCrop(center_crop_size), ToTensor(), ExpandChannels()] - return Compose(transforms) - - def register_to_controller(self): - logger.info("Register to controller") - - url = self.controller_addr + "/register_worker" - data = { - "worker_name": self.worker_addr, - "check_heart_beat": True, - "worker_status": self.get_status() - } - r = requests.post(url, json=data) - assert r.status_code == 200 - - def send_heart_beat(self): - logger.info(f"Send heart beat. Models: {[self.model_name]}. " - f"Semaphore: {pretty_print_semaphore(model_semaphore)}. 
" - f"global_counter: {global_counter}") - - url = self.controller_addr + "/receive_heart_beat" - - while True: - try: - ret = requests.post(url, json={ - "worker_name": self.worker_addr, - "queue_length": self.get_queue_length()}, timeout=5) - exist = ret.json()["exist"] - break - except requests.exceptions.RequestException as e: - logger.error(f"heart beat error: {e}") - time.sleep(5) - - if not exist: - self.register_to_controller() - - def get_queue_length(self): - if model_semaphore is None: - return 0 - else: - return args.limit_model_concurrency - model_semaphore._value + (len( - model_semaphore._waiters) if model_semaphore._waiters is not None else 0) - - def get_status(self): - return { - "model_names": [self.model_name], - "speed": 1, - "queue_length": self.get_queue_length(), - } - - @torch.inference_mode() - def generate_stream(self, params): - tokenizer, model, image_processor = self.tokenizer, self.model, self.image_processor - - prompt = params["prompt"] - ori_prompt = prompt - images = params.get("images", None) - num_image_tokens = 0 - if images is not None and len(images) > 0 and self.is_multimodal: - if len(images) > 0: - if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN): - raise ValueError("Number of images does not match number of tokens in prompt") - - if self.vision_tower == 'biovil': - images = [load_image_from_base64_biovil(image) for image in images] - images = process_image_biovil(images, self.vis_transforms_biovil) - else: - images = [load_image_from_base64(image) for image in images] - images = process_images(images, image_processor, model.config) - - if type(images) is list: - images = [image.to(self.model.device, dtype=torch.bfloat16) for image in images] - else: - images = images.to(self.model.device, dtype=torch.bfloat16) - - replace_token = DEFAULT_IMAGE_TOKEN - if getattr(self.model.config, 'mm_use_im_start_end', False): - replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN - prompt = 
prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token) - - num_image_tokens = prompt.count(replace_token) * 196 if self.vision_tower == 'biovil' else prompt.count(replace_token) * model.get_vision_tower().num_patches - else: - images = None - image_args = {"images": images} - else: - images = None - image_args = {} - - temperature = float(params.get("temperature", 1.0)) - top_p = float(params.get("top_p", 1.0)) - max_context_length = getattr(model.config, 'max_position_embeddings', 2048) - max_new_tokens = min(int(params.get("max_new_tokens", 256)), 1024) - stop_str = params.get("stop", None) - do_sample = True if temperature > 0.001 else False - - input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device) - keywords = [stop_str] - stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) - streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15) - - max_new_tokens = min(max_new_tokens, max_context_length - input_ids.shape[-1] - num_image_tokens) - - if max_new_tokens < 1: - yield json.dumps({"text": ori_prompt + "Exceeds max token length. 
Please start a new conversation, thanks.", "error_code": 0}).encode() + b"\0" - return - - thread = Thread(target=model.generate, kwargs=dict( - inputs=input_ids, - do_sample=do_sample, - temperature=temperature, - top_p=top_p, - max_new_tokens=max_new_tokens, - streamer=streamer, - stopping_criteria=[stopping_criteria], - use_cache=True, - **image_args - )) - thread.start() - - generated_text = ori_prompt - for new_text in streamer: - generated_text += new_text - if generated_text.endswith(stop_str): - generated_text = generated_text[:-len(stop_str)] - yield json.dumps({"text": generated_text, "error_code": 0}).encode() + b"\0" - - def generate_stream_gate(self, params): - try: - for x in self.generate_stream(params): - yield x - except ValueError as e: - print("Caught ValueError:", e) - ret = { - "text": server_error_msg, - "error_code": 1, - } - yield json.dumps(ret).encode() + b"\0" - except torch.cuda.CudaError as e: - print("Caught torch.cuda.CudaError:", e) - ret = { - "text": server_error_msg, - "error_code": 1, - } - yield json.dumps(ret).encode() + b"\0" - except Exception as e: - print("Caught Unknown Error", e) - ret = { - "text": server_error_msg, - "error_code": 1, - } - yield json.dumps(ret).encode() + b"\0" - - -app = FastAPI() - - -def release_model_semaphore(fn=None): - model_semaphore.release() - if fn is not None: - fn() - - -@app.post("/worker_generate_stream") -async def generate_stream(request: Request): - global model_semaphore, global_counter - global_counter += 1 - params = await request.json() - - if model_semaphore is None: - model_semaphore = asyncio.Semaphore(args.limit_model_concurrency) - await model_semaphore.acquire() - worker.send_heart_beat() - generator = worker.generate_stream_gate(params) - background_tasks = BackgroundTasks() - background_tasks.add_task(partial(release_model_semaphore, fn=worker.send_heart_beat)) - return StreamingResponse(generator, background=background_tasks) - - -@app.post("/worker_get_status") -async def 
get_status(request: Request): - return worker.get_status() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--host", type=str, default="localhost") - parser.add_argument("--port", type=int, default=21002) - parser.add_argument("--worker-address", type=str, - default="http://localhost:21002") - parser.add_argument("--controller-address", type=str, - default="http://localhost:21001") - parser.add_argument("--model-path", type=str, default="facebook/opt-350m") - parser.add_argument("--model-base", type=str, default=None) - parser.add_argument("--model-name", type=str) - parser.add_argument("--device", type=str, default="cuda") - parser.add_argument("--multi-modal", action="store_true", help="Multimodal mode is automatically detected with model name, please make sure `llava` is included in the model path.") - parser.add_argument("--limit-model-concurrency", type=int, default=5) - parser.add_argument("--stream-interval", type=int, default=1) - parser.add_argument("--no-register", action="store_true") - parser.add_argument("--load-8bit", action="store_true") - parser.add_argument("--load-4bit", action="store_true") - parser.add_argument("--vision_tower", type=str, default="openai/clip-vit-large-patch14-336") - args = parser.parse_args() - logger.info(f"args: {args}") - - if args.multi_modal: - logger.warning("Multimodal mode is automatically detected with model name, please make sure `llava` is included in the model path.") - - worker = ModelWorker(args.controller_address, - args.worker_address, - worker_id, - args.no_register, - args.model_path, - args.model_base, - args.model_name, - args.load_8bit, - args.load_4bit, - args.device, - args.vision_tower) - uvicorn.run(app, host=args.host, port=args.port, log_level="info") diff --git a/LLAVA_Biovil/llava/serve/register_worker.py b/LLAVA_Biovil/llava/serve/register_worker.py deleted file mode 100644 index 
2c2c40295e0351f25709ba25554c9329f15bf0d2..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/serve/register_worker.py +++ /dev/null @@ -1,26 +0,0 @@ -""" -Manually register workers. - -Usage: -python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 -""" - -import argparse - -import requests - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--controller-address", type=str) - parser.add_argument("--worker-name", type=str) - parser.add_argument("--check-heart-beat", action="store_true") - args = parser.parse_args() - - url = args.controller_address + "/register_worker" - data = { - "worker_name": args.worker_name, - "check_heart_beat": args.check_heart_beat, - "worker_status": None, - } - r = requests.post(url, json=data) - assert r.status_code == 200 diff --git a/LLAVA_Biovil/llava/serve/test_message.py b/LLAVA_Biovil/llava/serve/test_message.py deleted file mode 100644 index f01c4edc40536408cc3411921aebacfd4e0b62de..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/serve/test_message.py +++ /dev/null @@ -1,62 +0,0 @@ -import argparse -import json - -import requests - -from LLAV.llava.conversation import default_conversation - - -def main(): - if args.worker_address: - worker_addr = args.worker_address - else: - controller_addr = args.controller_address - ret = requests.post(controller_addr + "/refresh_all_workers") - ret = requests.post(controller_addr + "/list_models") - models = ret.json()["models"] - models.sort() - print(f"Models: {models}") - - ret = requests.post(controller_addr + "/get_worker_address", - json={"model": args.model_name}) - worker_addr = ret.json()["address"] - print(f"worker_addr: {worker_addr}") - - if worker_addr == "": - return - - conv = default_conversation.copy() - conv.append_message(conv.roles[0], args.message) - prompt = conv.get_prompt() - - headers = {"User-Agent": "LLaVA Client"} - pload = { - "model": 
args.model_name, - "prompt": prompt, - "max_new_tokens": args.max_new_tokens, - "temperature": 0.7, - "stop": conv.sep, - } - response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, - json=pload, stream=True) - - print(prompt.replace(conv.sep, "\n"), end="") - for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): - if chunk: - data = json.loads(chunk.decode("utf-8")) - output = data["text"].split(conv.sep)[-1] - print(output, end="\r") - print("") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--controller-address", type=str, default="http://localhost:21001") - parser.add_argument("--worker-address", type=str) - parser.add_argument("--model-name", type=str, default="facebook/opt-350m") - parser.add_argument("--max-new-tokens", type=int, default=32) - parser.add_argument("--message", type=str, default= - "Tell me a story with more than 1000 words.") - args = parser.parse_args() - - main() diff --git a/LLAVA_Biovil/llava/train/__init__.py b/LLAVA_Biovil/llava/train/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/LLAVA_Biovil/llava/train/llama_flash_attn_monkey_patch.py b/LLAVA_Biovil/llava/train/llama_flash_attn_monkey_patch.py deleted file mode 100644 index 31db2eff8d1c4b3ae645583dfc5e156e818b6f1c..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/train/llama_flash_attn_monkey_patch.py +++ /dev/null @@ -1,115 +0,0 @@ -from typing import Optional, Tuple -import warnings - -import torch - -import transformers -from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv - -try: - from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func -except ImportError: - from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func -from flash_attn.bert_padding import 
unpad_input, pad_input - - -def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - warnings.warn( - "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead." - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = ( - self.q_proj(hidden_states) - .view(bsz, q_len, self.num_heads, self.head_dim) - .transpose(1, 2) - ) - key_states = ( - self.k_proj(hidden_states) - .view(bsz, q_len, self.num_key_value_heads, self.head_dim) - .transpose(1, 2) - ) - value_states = ( - self.v_proj(hidden_states) - .view(bsz, q_len, self.num_key_value_heads, self.head_dim) - .transpose(1, 2) - ) # shape: (b, num_heads, s, head_dim) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin, position_ids - ) - - if past_key_value is not None: - # reuse k, v - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - # Transform the data into the format required by flash attention - qkv = torch.stack([query_states, key_states, value_states], dim=2) - qkv = qkv.transpose(1, 3) # shape: [b, s, 3, num_heads, head_dim] - key_padding_mask = attention_mask - - if key_padding_mask is None: - qkv = qkv.reshape(-1, 
3, self.num_heads, self.head_dim) - cu_q_lens = torch.arange( - 0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device - ) - max_s = q_len - output = flash_attn_unpadded_qkvpacked_func( - qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True - ) - output = output.view(bsz, q_len, -1) - else: - qkv = qkv.reshape(bsz, q_len, -1) - qkv, indices, cu_q_lens, max_s = unpad_input(qkv, key_padding_mask) - qkv = qkv.view(-1, 3, self.num_heads, self.head_dim) - output_unpad = flash_attn_unpadded_qkvpacked_func( - qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True - ) - output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim) - output = pad_input(output_unpad, indices, bsz, q_len) - - return self.o_proj(output), None, past_key_value - - -# Disable the transformation of the attention mask in LlamaModel as the flash attention -# requires the attention mask to be the same as the key_padding_mask -def _prepare_decoder_attention_mask( - self, attention_mask, input_shape, inputs_embeds, past_key_values_length -): - # [bsz, seq_len] - return attention_mask - - -def replace_llama_attn_with_flash_attn(): - cuda_major, cuda_minor = torch.cuda.get_device_capability() - if cuda_major < 8: - warnings.warn( - "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward." 
- "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593" - ) - transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( - _prepare_decoder_attention_mask - ) - transformers.models.llama.modeling_llama.LlamaAttention.forward = forward diff --git a/LLAVA_Biovil/llava/train/llama_patch.py b/LLAVA_Biovil/llava/train/llama_patch.py deleted file mode 100644 index 00d6a7ee09a15c5e8c7cdc49fb89958eb29c03ff..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/train/llama_patch.py +++ /dev/null @@ -1,139 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -import warnings -import transformers -from transformers.models.llama.modeling_llama import apply_rotary_pos_emb -from peft.tuners.lora import LoraLayer - -try: - from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func - from flash_attn.bert_padding import unpad_input, pad_input -except Exception: - raise ModuleNotFoundError( - "Please install FlashAttention first, e.g., with pip install flash-attn --no-build-isolation, Learn more at https://github.com/Dao-AILab/flash-attention#installation-and-features" - ) - -try: - from einops import rearrange -except Exception: - raise ModuleNotFoundError("Please install einops first, e.g., with pip install einops") - - -# ADAPTED from https://github.com/allenai/open-instruct/blob/main/open_instruct/llama_flash_attn_monkey_patch.py -# AND https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama_flash_attn_monkey_patch.py -# AND https://github.com/LAION-AI/Open-Assistant/blob/04fa9a24b2a58c8885b8aa6a2eb02b18de6b4961/model/model_training/models/patching_llama.py -# AND Sourabh https://github.com/huggingface/transformers/commit/ee81bf5aee0d65f005d157c013777e3d27d8d6bf -def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - past_key_value: 
Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel - - attention_mask: [bsz, q_len] - """ - if output_attentions: - warnings.warn("Output attentions is not supported for patched `LlamaAttention`, returning `None` instead.") - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # [bsz, q_len, nh, hd] - # [bsz, nh, q_len, hd] - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - # Past Key value support - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - # Flash attention codes from - # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py - - # transform the data into the format required by flash attention - qkv = torch.stack([query_states, key_states, value_states], dim=2) # [bsz, nh, 3, q_len, hd] - qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd] - # We have disabled _prepare_decoder_attention_mask in LlamaModel - # the attention_mask should be the same as the key_padding_mask - key_padding_mask = attention_mask - - if key_padding_mask is None: - qkv = rearrange(qkv, "b s ... 
-> (b s) ...") - max_s = q_len - cu_q_lens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device) - output = flash_attn_varlen_qkvpacked_func(qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True) - output = rearrange(output, "(b s) ... -> b s ...", b=bsz) - else: - nheads = qkv.shape[-2] - x = rearrange(qkv, "b s three h d -> b s (three h d)") - x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask) - x_unpad = rearrange(x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads) - output_unpad = flash_attn_varlen_qkvpacked_func( - x_unpad, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True - ) - output = rearrange( - pad_input(rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz, q_len), - "b s (h d) -> b s h d", - h=nheads, - ) - return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, past_key_value - - -# Disable the transformation of the attention mask in LlamaModel as the flash attention -# requires the attention mask to be the same as the key_padding_mask -def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): - # [bsz, seq_len] - return attention_mask - - -def replace_attn_with_flash_attn(): - cuda_major, cuda_minor = torch.cuda.get_device_capability() - if cuda_major < 8: - print( - "Flash attention is only supported on Ampere or Hopper GPU during training due to head dim > 64 backward." 
- "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593" - ) - transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( - _prepare_decoder_attention_mask - ) - transformers.models.llama.modeling_llama.LlamaAttention.forward = forward - - -def unplace_flash_attn_with_attn(): - import importlib - import transformers - - print("Reloading llama model, unpatching flash attention") - importlib.reload(transformers.models.llama.modeling_llama) - - -# Adapted from https://github.com/tmm1/axolotl/blob/2eda9e02a9d15a7a3f92b41f257d9844d72fc220/src/axolotl/utils/models.py#L338 -def upcast_layer_for_flash_attention(model, torch_dtype): - # LlamaRMSNorm layers are in fp32 after kbit_training, so we need to - # convert them back to fp16/bf16 for flash-attn compatibility. - for name, module in model.named_modules(): - if isinstance(module, LoraLayer): - module.to(torch_dtype) - if "norm" in name: - module.to(torch_dtype) - if "lm_head" in name or "embed_tokens" in name: - if hasattr(module, "weight"): - module.to(torch_dtype) - - return model diff --git a/LLAVA_Biovil/llava/train/llama_xformers_attn_monkey_patch.py b/LLAVA_Biovil/llava/train/llama_xformers_attn_monkey_patch.py deleted file mode 100644 index f8351e41ccd4a64dca237bd8f8be0702b23989dc..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/train/llama_xformers_attn_monkey_patch.py +++ /dev/null @@ -1,129 +0,0 @@ -""" -Directly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments -""" - -import logging -import math -from typing import Optional, Tuple - -import torch -import transformers.models.llama.modeling_llama -from torch import nn - -try: - import xformers.ops -except ImportError: - logging.error("xformers not found! 
Please install it before trying to use it.") - - -def replace_llama_attn_with_xformers_attn(): - transformers.models.llama.modeling_llama.LlamaAttention.forward = xformers_forward - - -def xformers_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - # pylint: disable=duplicate-code - bsz, q_len, _ = hidden_states.size() - - query_states = ( - self.q_proj(hidden_states) - .view(bsz, q_len, self.num_heads, self.head_dim) - .transpose(1, 2) - ) - key_states = ( - self.k_proj(hidden_states) - .view(bsz, q_len, self.num_heads, self.head_dim) - .transpose(1, 2) - ) - value_states = ( - self.v_proj(hidden_states) - .view(bsz, q_len, self.num_heads, self.head_dim) - .transpose(1, 2) - ) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - ( - query_states, - key_states, - ) = transformers.models.llama.modeling_llama.apply_rotary_pos_emb( - query_states, key_states, cos, sin, position_ids - ) - # [bsz, nh, t, hd] - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - # We only apply xformers optimizations if we don't need to output the whole attention matrix - if not output_attentions: - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - # This is a nasty hack. We know attention_mask in transformers is either LowerTriangular or all Zeros. 
- # We therefore check if one element in the upper triangular portion is zero. If it is, then the mask is all zeros. - if attention_mask is None or attention_mask[0, 0, 0, 1] == 0: - # input and output should be of form (bsz, q_len, num_heads, head_dim) - attn_output = xformers.ops.memory_efficient_attention( - query_states, key_states, value_states, attn_bias=None - ) - else: - # input and output should be of form (bsz, q_len, num_heads, head_dim) - attn_output = xformers.ops.memory_efficient_attention( - query_states, - key_states, - value_states, - attn_bias=xformers.ops.LowerTriangularMask(), - ) - attn_weights = None - else: - attn_weights = torch.matmul( - query_states, key_states.transpose(2, 3) - ) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - attn_weights = torch.max( - attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min) - ) - - # upcast attention to fp32 - attn_weights = nn.functional.softmax( - attn_weights, dim=-1, dtype=torch.float32 - ).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - attn_output = self.o_proj(attn_output) - return attn_output, attn_weights, past_key_value diff --git a/LLAVA_Biovil/llava/train/llava_trainer.py 
b/LLAVA_Biovil/llava/train/llava_trainer.py deleted file mode 100644 index ab4600e75eb0c16729ac80ca495fa71b2bd03fb9..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/train/llava_trainer.py +++ /dev/null @@ -1,801 +0,0 @@ -import json -import math -import os -import shutil -import sys -import time -from distutils import dist - -import torch -from torch import nn -import numpy as np - -from torch.utils.data import Sampler -from packaging import version - -from transformers import Trainer, TrainerState, is_torch_tpu_available, is_apex_available -from transformers.debug_utils import DebugOption -from transformers.integrations import hp_params -from transformers.deepspeed import deepspeed_init, deepspeed_load_checkpoint - -from transformers.trainer import ( - is_sagemaker_mp_enabled, - get_parameter_names, - has_length, - ALL_LAYERNORM_LAYERS, - ShardedDDPOption, - logger, TRAINER_STATE_NAME, -) -from typing import List, Optional - -from transformers.trainer_pt_utils import get_model_param_count -from transformers.trainer_utils import HPSearchBackend, speed_metrics, TrainOutput -from transformers.training_args import ParallelMode -from transformers.utils import is_accelerate_available - -if is_accelerate_available(): - from accelerate import Accelerator, skip_first_batches - from accelerate import __version__ as accelerate_version - from accelerate.utils import DistributedDataParallelKwargs, GradientAccumulationPlugin - - if version.parse(accelerate_version) > version.parse("0.20.3"): - from accelerate.utils import ( - load_fsdp_model, - load_fsdp_optimizer, - save_fsdp_model, - save_fsdp_optimizer, - ) - -if is_torch_tpu_available(check_device=False): - import torch_xla.core.xla_model as xm - import torch_xla.debug.metrics as met - -if is_apex_available(): - from apex import amp - -# with open('/home/guests/chantal_pellegrini/RaDialog_LLaVA/data/train_token_freqs_radrestruct_balanced_50ep.json') as f: -# token_frequencies = json.load(f) -# 
token_weights = {k: 1 / v for k, v in token_frequencies.items()} # linear weighting -# print("lin weighting") - -# token_weights = {k: 1 / (np.log(v) + 1) for k, v in token_frequencies.items()} # log weighting, seems to work better in this case -# print("log weighting") -token_weights = None # no weighting -print("no weighting") - -if token_weights is not None: - min_weight = min(token_weights.values()) - extra_token_weight = min_weight / 100 # 100 smaller than the smallest weight - - -def maybe_zero_3(param, ignore_status=False, name=None): - from deepspeed import zero - from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus - if hasattr(param, "ds_id"): - if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: - if not ignore_status: - print(name, 'no ignore status') - with zero.GatheredParameters([param]): - param = param.data.detach().cpu().clone() - else: - param = param.detach().cpu().clone() - return param - - -def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match): - to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)} - to_return = {k: maybe_zero_3(v, ignore_status=True, name=k).cpu() for k, v in to_return.items()} - return to_return - - -def split_to_even_chunks(indices, lengths, num_chunks): - """ - Split a list of indices into `chunks` chunks of roughly equal lengths. 
- """ - - if len(indices) % num_chunks != 0: - return [indices[i::num_chunks] for i in range(num_chunks)] - - num_indices_per_chunk = len(indices) // num_chunks - - chunks = [[] for _ in range(num_chunks)] - chunks_lengths = [0 for _ in range(num_chunks)] - for index in indices: - shortest_chunk = chunks_lengths.index(min(chunks_lengths)) - chunks[shortest_chunk].append(index) - chunks_lengths[shortest_chunk] += lengths[index] - if len(chunks[shortest_chunk]) == num_indices_per_chunk: - chunks_lengths[shortest_chunk] = float("inf") - - return chunks - - -def get_modality_length_grouped_indices(lengths, batch_size, world_size, generator=None): - # We need to use torch for the random part as a distributed sampler will set the random seed for torch. - assert all(l != 0 for l in lengths), "Should not have zero length." - if all(l > 0 for l in lengths) or all(l < 0 for l in lengths): - # all samples are in the same modality - return get_length_grouped_indices(lengths, batch_size, world_size, generator=generator) - mm_indices, mm_lengths = zip(*[(i, l) for i, l in enumerate(lengths) if l > 0]) - lang_indices, lang_lengths = zip(*[(i, -l) for i, l in enumerate(lengths) if l < 0]) - - mm_shuffle = [mm_indices[i] for i in get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=None)] - lang_shuffle = [lang_indices[i] for i in get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=None)] - megabatch_size = world_size * batch_size - mm_megabatches = [mm_shuffle[i : i + megabatch_size] for i in range(0, len(mm_shuffle), megabatch_size)] - lang_megabatches = [lang_shuffle[i : i + megabatch_size] for i in range(0, len(lang_shuffle), megabatch_size)] - - last_mm = mm_megabatches[-1] - last_lang = lang_megabatches[-1] - additional_batch = last_mm + last_lang - megabatches = mm_megabatches[:-1] + lang_megabatches[:-1] - megabatch_indices = torch.randperm(len(megabatches), generator=generator) - megabatches = [megabatches[i] for i in 
megabatch_indices] - - if len(additional_batch) > 0: - megabatches.append(sorted(additional_batch)) - - return [i for megabatch in megabatches for i in megabatch] - - -def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True): - # We need to use torch for the random part as a distributed sampler will set the random seed for torch. - indices = torch.randperm(len(lengths), generator=generator) - megabatch_size = world_size * batch_size - megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)] - megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches] - megabatches = [split_to_even_chunks(megabatch, lengths, world_size) for megabatch in megabatches] - - return [i for megabatch in megabatches for batch in megabatch for i in batch] - - -class LengthGroupedSampler(Sampler): - r""" - Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while - keeping a bit of randomness. - """ - - def __init__( - self, - batch_size: int, - world_size: int, - lengths: Optional[List[int]] = None, - generator=None, - group_by_modality: bool = False, - ): - if lengths is None: - raise ValueError("Lengths must be provided.") - - self.batch_size = batch_size - self.world_size = world_size - self.lengths = lengths - self.generator = generator - self.group_by_modality = group_by_modality - - def __len__(self): - return len(self.lengths) - - def __iter__(self): - if self.group_by_modality: - indices = get_modality_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator) - else: - indices = get_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator) - return iter(indices) - - -class LLaVATrainer(Trainer): - - def compute_loss(self, model, inputs, return_outputs=False): - """ - How the loss is computed by Trainer. 
By default, all models return the loss in the first element. - - Subclass and override for custom behavior. - """ - outputs = model(**inputs) - - # Save past state if it exists - # TODO: this needs to be fixed and made cleaner later. - if self.args.past_index >= 0: - self._past = outputs[self.args.past_index] - - if token_weights is not None: - # check if self has attribute vocab_weight, otherwise create - if not hasattr(self, 'vocab_weight'): - vocab = self.tokenizer.get_vocab() - self.vocab_weight = torch.ones(len(vocab)) * extra_token_weight # default weight - # map them using vocab to correct indices - for k, v in token_weights.items(): - self.vocab_weight[vocab[k]] = v - self.vocab_weight = self.vocab_weight.to(self.args.device) - - # Shift so that tokens < n predict n - shift_logits = outputs.logits[..., :-1, :].contiguous() - shift_labels = outputs.modified_labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = nn.CrossEntropyLoss(weight=self.vocab_weight) - shift_logits = shift_logits.view(-1, self.model.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - return (loss, outputs) if return_outputs else loss - - else: #orginial compute_loss without weighting - # We don't use .loss here since the model may return tuples instead of ModelOutput. 
- loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] - - return (loss, outputs) if return_outputs else loss - - - def _inner_training_loop( - self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None - ): - self.accelerator.free_memory() - self._train_batch_size = batch_size - logger.debug(f"Currently training with a batch size of: {self._train_batch_size}") - # Data loader and number of training steps - train_dataloader = self.get_train_dataloader() - - # Setting up training control variables: - # number of training epochs: num_train_epochs - # number of training steps per epoch: num_update_steps_per_epoch - # total number of training steps to execute: max_steps - total_train_batch_size = self._train_batch_size * args.gradient_accumulation_steps * args.world_size - - len_dataloader = None - if has_length(train_dataloader): - len_dataloader = len(train_dataloader) - num_update_steps_per_epoch = len_dataloader // args.gradient_accumulation_steps - num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1) - num_examples = self.num_examples(train_dataloader) - if args.max_steps > 0: - max_steps = args.max_steps - num_train_epochs = args.max_steps // num_update_steps_per_epoch + int( - args.max_steps % num_update_steps_per_epoch > 0 - ) - # May be slightly incorrect if the last batch in the training dataloader has a smaller size but it's - # the best we can do. - num_train_samples = args.max_steps * total_train_batch_size - else: - max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch) - num_train_epochs = math.ceil(args.num_train_epochs) - num_train_samples = self.num_examples(train_dataloader) * args.num_train_epochs - elif args.max_steps > 0: # Rely on max_steps when dataloader does not have a working size - max_steps = args.max_steps - # Setting a very large number of epochs so we go as many times as necessary over the iterator. 
- num_train_epochs = sys.maxsize - num_update_steps_per_epoch = max_steps - num_examples = total_train_batch_size * args.max_steps - num_train_samples = args.max_steps * total_train_batch_size - else: - raise ValueError( - "args.max_steps must be set to a positive value if dataloader does not have a length, was" - f" {args.max_steps}" - ) - - # Compute absolute values for logging, eval, and save if given as ratio - if args.logging_steps and args.logging_steps < 1: - args.logging_steps = math.ceil(max_steps * args.logging_steps) - if args.eval_steps and args.eval_steps < 1: - args.eval_steps = math.ceil(max_steps * args.eval_steps) - if args.save_steps and args.save_steps < 1: - args.save_steps = math.ceil(max_steps * args.save_steps) - - if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug: - if self.args.n_gpu > 1: - # nn.DataParallel(model) replicates the model, creating new variables and module - # references registered here no longer work on other gpus, breaking the module - raise ValueError( - "Currently --debug underflow_overflow is not supported under DP. Please use DDP" - " (torch.distributed.launch)." 
- ) - else: - debug_overflow = DebugUnderflowOverflow(self.model) # noqa - - delay_optimizer_creation = ( - self.sharded_ddp is not None - and self.sharded_ddp != ShardedDDPOption.SIMPLE - or is_sagemaker_mp_enabled() - or self.fsdp is not None - ) - - # We need to reset the scheduler, as its parameters may be different on subsequent calls - if self._created_lr_scheduler: - self.lr_scheduler = None - self._created_lr_scheduler = False - - if self.is_deepspeed_enabled: - self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps) - - if not delay_optimizer_creation: - self.create_optimizer_and_scheduler(num_training_steps=max_steps) - - self.state = TrainerState() - self.state.is_hyper_param_search = trial is not None - - # Activate gradient checkpointing if needed - if args.gradient_checkpointing: - self.model.gradient_checkpointing_enable() - - model = self._wrap_model(self.model_wrapped) - - if is_sagemaker_mp_enabled() and resume_from_checkpoint is not None: - self._load_from_checkpoint(resume_from_checkpoint, model) - - # as the model is wrapped, don't use `accelerator.prepare` - # this is for unhandled cases such as - # Fairscale Sharded DDP, FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX - use_accelerator_prepare = True if model is self.model else False - - if delay_optimizer_creation: - self.create_optimizer_and_scheduler(num_training_steps=max_steps) - - # prepare using `accelerator` prepare - if use_accelerator_prepare: - self.model.train() - if hasattr(self.lr_scheduler, "step"): - if self.use_apex: - model = self.accelerator.prepare(self.model) - else: - model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer) - else: - # to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config. 
- model, self.optimizer, self.lr_scheduler = self.accelerator.prepare( - self.model, self.optimizer, self.lr_scheduler - ) - - if self.is_fsdp_enabled: - self.model = model - - # for the rest of this function `model` is the outside model, whether it was wrapped or not - if model is not self.model: - self.model_wrapped = model - - # backward compatibility - if self.is_deepspeed_enabled: - self.deepspeed = self.model_wrapped - - # deepspeed ckpt loading - if resume_from_checkpoint is not None and self.is_deepspeed_enabled: - print(f"DeepSpeed info: Loading model from {resume_from_checkpoint}") - deepspeed_load_checkpoint(self.model_wrapped, resume_from_checkpoint) - # get step from opt state - # Assuming `optimizer_state_dict` is the dictionary you've loaded from the checkpoint - for param_tensor, state in self.lr_scheduler.optimizer.state.items(): - step_tensor = state['step'] - step_value = step_tensor.item() # Convert tensor to a Python number - print(f"Step value for a parameter tensor: {step_value}") - # Since all parameters should have been updated the same number of times, - # you can break after the first iteration - break - # step scheduler to match - for _ in range(int(step_value)): - self.lr_scheduler.step() - # Check if saved optimizer or scheduler states exist - self._load_optimizer_and_scheduler(resume_from_checkpoint) - - # important: at this point: - # self.model is the Transformers Model - # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), etc. - - # Train! - logger.info("***** Running training *****") - logger.info(f" Num examples = {num_examples:,}") - logger.info(f" Num Epochs = {num_train_epochs:,}") - logger.info(f" Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}") - if self.args.per_device_train_batch_size != self._train_batch_size: - logger.info(f" Training with DataParallel so batch size has been adjusted to: {self._train_batch_size:,}") - logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_train_batch_size:,}") - logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {max_steps:,}") - logger.info(f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}") - - self.state.epoch = 0 - start_time = time.time() - epochs_trained = 0 - steps_trained_in_current_epoch = 0 - steps_trained_progress_bar = None - - # Check if continuing training from a checkpoint - if resume_from_checkpoint is not None and os.path.isfile( - os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME) - ): - self.state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)) - epochs_trained = self.state.global_step // num_update_steps_per_epoch - if not args.ignore_data_skip: - steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch) - steps_trained_in_current_epoch *= args.gradient_accumulation_steps - else: - steps_trained_in_current_epoch = 0 - - logger.info(" Continuing training from checkpoint, will skip to saved global_step") - logger.info(f" Continuing training from epoch {epochs_trained}") - logger.info(f" Continuing training from global step {self.state.global_step}") - if not args.ignore_data_skip: - logger.info( - f" Will skip the first {epochs_trained} epochs then the first" - f" {steps_trained_in_current_epoch} batches in the first epoch." - ) - - # Update the references - self.callback_handler.model = self.model - self.callback_handler.optimizer = self.optimizer - self.callback_handler.lr_scheduler = self.lr_scheduler - self.callback_handler.train_dataloader = train_dataloader - if self.hp_name is not None and self._trial is not None: - # use self._trial because the SigOpt/Optuna hpo only call `_hp_search_setup(trial)` instead of passing trial - # parameter to Train when using DDP. 
- self.state.trial_name = self.hp_name(self._trial) - if trial is not None: - assignments = trial.assignments if self.hp_search_backend == HPSearchBackend.SIGOPT else trial - self.state.trial_params = hp_params(assignments) - else: - self.state.trial_params = None - # This should be the same if the state has been saved but in case the training arguments changed, it's safer - # to set this after the load. - self.state.max_steps = max_steps - self.state.num_train_epochs = num_train_epochs - self.state.is_local_process_zero = self.is_local_process_zero() - self.state.is_world_process_zero = self.is_world_process_zero() - - # tr_loss is a tensor to avoid synchronization of TPUs through .item() - tr_loss = torch.tensor(0.0).to(args.device) - # _total_loss_scalar is updated everytime .item() has to be called on tr_loss and stores the sum of all losses - self._total_loss_scalar = 0.0 - self._globalstep_last_logged = self.state.global_step - model.zero_grad() - - self.control = self.callback_handler.on_train_begin(args, self.state, self.control) - - # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. - if not args.ignore_data_skip: - for epoch in range(epochs_trained): - for _ in train_dataloader: - break - - total_batched_samples = 0 - for epoch in range(epochs_trained, num_train_epochs): - epoch_iterator = train_dataloader - - # Reset the past mems state at the beginning of each epoch if necessary. 
- if args.past_index >= 0: - self._past = None - - steps_in_epoch = ( - len(epoch_iterator) - if len_dataloader is not None - else args.max_steps * args.gradient_accumulation_steps - ) - self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control) - - if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0: - self._load_rng_state(resume_from_checkpoint) - - rng_to_sync = False - steps_skipped = 0 - if steps_trained_in_current_epoch > 0: - epoch_iterator = skip_first_batches(epoch_iterator, steps_trained_in_current_epoch) - steps_skipped = steps_trained_in_current_epoch - steps_trained_in_current_epoch = 0 - rng_to_sync = True - - step = -1 - for step, inputs in enumerate(epoch_iterator): - total_batched_samples += 1 - if rng_to_sync: - self._load_rng_state(resume_from_checkpoint) - rng_to_sync = False - - # Skip past any already trained steps if resuming training - if steps_trained_in_current_epoch > 0: - steps_trained_in_current_epoch -= 1 - if steps_trained_progress_bar is not None: - steps_trained_progress_bar.update(1) - if steps_trained_in_current_epoch == 0: - self._load_rng_state(resume_from_checkpoint) - continue - elif steps_trained_progress_bar is not None: - steps_trained_progress_bar.close() - steps_trained_progress_bar = None - - if step % args.gradient_accumulation_steps == 0: - self.control = self.callback_handler.on_step_begin(args, self.state, self.control) - - with self.accelerator.accumulate(model): - tr_loss_step = self.training_step(model, inputs) - - if ( - args.logging_nan_inf_filter - and not is_torch_tpu_available() - and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) - ): - # if loss is nan or inf simply add the average of previous logged losses - tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged) - else: - tr_loss += tr_loss_step - - self.current_flos += float(self.floating_point_ops(inputs)) - - 
is_last_step_and_steps_less_than_grad_acc = ( - steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch - ) - - if ( - total_batched_samples % args.gradient_accumulation_steps == 0 - or - # last step in epoch but step is always smaller than gradient_accumulation_steps - is_last_step_and_steps_less_than_grad_acc - ): - # the `or` condition of `is_last_step_and_steps_less_than_grad_acc` is not covered - # in accelerate. So, explicitly enable sync gradients to True in that case. - if is_last_step_and_steps_less_than_grad_acc or ( - version.parse(accelerate_version) <= version.parse("0.20.3") - ): - self.accelerator.gradient_state._set_sync_gradients(True) - - # Gradient clipping - if args.max_grad_norm is not None and args.max_grad_norm > 0: - # deepspeed does its own clipping - - if self.do_grad_scaling: - # Reduce gradients first for XLA - if is_torch_tpu_available(): - gradients = xm._fetch_gradients(self.optimizer) - xm.all_reduce("sum", gradients, scale=1.0 / xm.xrt_world_size()) - # AMP: gradients need unscaling - self.scaler.unscale_(self.optimizer) - - if is_sagemaker_mp_enabled() and args.fp16: - self.optimizer.clip_master_grads(args.max_grad_norm) - elif hasattr(self.optimizer, "clip_grad_norm"): - # Some optimizers (like the sharded optimizer) have a specific way to do gradient clipping - self.optimizer.clip_grad_norm(args.max_grad_norm) - elif hasattr(model, "clip_grad_norm_"): - # Some models (like FullyShardedDDP) have a specific way to do gradient clipping - model.clip_grad_norm_(args.max_grad_norm) - elif self.use_apex: - # Revert to normal clipping otherwise, handling Apex or full precision - nn.utils.clip_grad_norm_( - amp.master_params(self.optimizer), - args.max_grad_norm, - ) - else: - self.accelerator.clip_grad_norm_( - model.parameters(), - args.max_grad_norm, - ) - - # Optimizer step - optimizer_was_run = True - if is_torch_tpu_available(): - if self.do_grad_scaling: - self.scaler.step(self.optimizer) - 
self.scaler.update() - else: - # tpu-comment: accelerate wrapped optimizers call xm.optimizer_step - self.optimizer.step() - elif self.do_grad_scaling: - scale_before = self.scaler.get_scale() - self.scaler.step(self.optimizer) - self.scaler.update() - scale_after = self.scaler.get_scale() - optimizer_was_run = scale_before <= scale_after - else: - self.optimizer.step() - optimizer_was_run = not self.accelerator.optimizer_step_was_skipped - - if optimizer_was_run: - # Delay optimizer scheduling until metrics are generated - if not isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): - self.lr_scheduler.step() - - model.zero_grad() - self.state.global_step += 1 - self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch - self.control = self.callback_handler.on_step_end(args, self.state, self.control) - - self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) - else: - self.control = self.callback_handler.on_substep_end(args, self.state, self.control) - - if self.control.should_epoch_stop or self.control.should_training_stop: - break - if step < 0: - logger.warning( - "There seems to be not a single sample in your epoch_iterator, stopping training at step" - f" {self.state.global_step}! This is expected if you're using an IterableDataset and set" - f" num_steps ({max_steps}) higher than the number of available samples." - ) - self.control.should_training_stop = True - - self.control = self.callback_handler.on_epoch_end(args, self.state, self.control) - self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) - - if DebugOption.TPU_METRICS_DEBUG in self.args.debug: - if is_torch_tpu_available(): - # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) - xm.master_print(met.metrics_report()) - else: - logger.warning( - "You enabled PyTorch/XLA debug metrics but you don't have a TPU " - "configured. 
Check your training configuration if this is unexpected." - ) - if self.control.should_training_stop: - break - - if args.past_index and hasattr(self, "_past"): - # Clean the state at the end of training - delattr(self, "_past") - - logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") - if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: - # Wait for everyone to get here so we are sur the model has been saved by process 0. - if is_torch_tpu_available(): - xm.rendezvous("load_best_model_at_end") - elif args.parallel_mode == ParallelMode.DISTRIBUTED: - dist.barrier() - # elif is_sagemaker_mp_enabled(): - # smp.barrier() - - self._load_best_model() - - # add remaining tr_loss - self._total_loss_scalar += tr_loss.item() - train_loss = self._total_loss_scalar / self.state.global_step - - metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=self.state.max_steps) - self.store_flos() - metrics["total_flos"] = self.state.total_flos - metrics["train_loss"] = train_loss - - self.is_in_train = False - - self._memory_tracker.stop_and_update_metrics(metrics) - - self.log(metrics) - - run_dir = self._get_output_dir(trial) - checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir) - - # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save. 
- if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1: - for checkpoint in checkpoints_sorted: - if checkpoint != self.state.best_model_checkpoint: - logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") - shutil.rmtree(checkpoint) - - self.control = self.callback_handler.on_train_end(args, self.state, self.control) - - return TrainOutput(self.state.global_step, train_loss, metrics) - - def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]: - if self.train_dataset is None or not has_length(self.train_dataset): - return None - - if self.args.group_by_modality_length: - lengths = self.train_dataset.modality_lengths - return LengthGroupedSampler( - self.args.train_batch_size, - world_size=self.args.world_size * self.args.gradient_accumulation_steps, - lengths=lengths, - group_by_modality=True, - ) - else: - return super()._get_train_sampler() - - def create_optimizer(self): - """ - Setup the optimizer. - - We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the - Trainer's init through `optimizers`, or subclass and override this method in a subclass. 
- """ - if is_sagemaker_mp_enabled(): - return super().create_optimizer() - if self.sharded_ddp == ShardedDDPOption.SIMPLE: - return super().create_optimizer() - - opt_model = self.model - - if self.optimizer is None: - decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS) - decay_parameters = [name for name in decay_parameters if "bias" not in name] - if self.args.mm_projector_lr is not None: - projector_parameters = [name for name, _ in opt_model.named_parameters() if "mm_projector" in name] - optimizer_grouped_parameters = [ - { - "params": [ - p for n, p in opt_model.named_parameters() if (n in decay_parameters and n not in projector_parameters and p.requires_grad) - ], - "weight_decay": self.args.weight_decay, - }, - { - "params": [ - p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n not in projector_parameters and p.requires_grad) - ], - "weight_decay": 0.0, - }, - { - "params": [ - p for n, p in opt_model.named_parameters() if (n in decay_parameters and n in projector_parameters and p.requires_grad) - ], - "weight_decay": self.args.weight_decay, - "lr": self.args.mm_projector_lr, - }, - { - "params": [ - p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n in projector_parameters and p.requires_grad) - ], - "weight_decay": 0.0, - "lr": self.args.mm_projector_lr, - }, - ] - else: - optimizer_grouped_parameters = [ - { - "params": [ - p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad) - ], - "weight_decay": self.args.weight_decay, - }, - { - "params": [ - p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad) - ], - "weight_decay": 0.0, - }, - ] - - optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args) - - if self.sharded_ddp == ShardedDDPOption.SIMPLE: - self.optimizer = OSS( - params=optimizer_grouped_parameters, - optim=optimizer_cls, - **optimizer_kwargs, - ) - else: - 
self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) - if optimizer_cls.__name__ == "Adam8bit": - import bitsandbytes - - manager = bitsandbytes.optim.GlobalOptimManager.get_instance() - - skipped = 0 - for module in opt_model.modules(): - if isinstance(module, nn.Embedding): - skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values()) - logger.info(f"skipped {module}: {skipped/2**20}M params") - manager.register_module_override(module, "weight", {"optim_bits": 32}) - logger.debug(f"bitsandbytes: will optimize {module} in fp32") - logger.info(f"skipped: {skipped/2**20}M params") - - return self.optimizer - - def _save_checkpoint(self, model, trial, metrics=None): - if getattr(self.args, 'tune_mm_mlp_adapter', False): - from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR - checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" - - run_dir = self._get_output_dir(trial=trial) - output_dir = os.path.join(run_dir, checkpoint_folder) - - # Only save Adapter - keys_to_match = ['mm_projector', 'vision_resampler'] - if getattr(self.args, "use_im_start_end", False): - keys_to_match.extend(['embed_tokens', 'embed_in']) - - weight_to_save = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), keys_to_match) - - if self.args.local_rank == 0 or self.args.local_rank == -1: - self.model.config.save_pretrained(output_dir) - torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin')) - else: - super(LLaVATrainer, self)._save_checkpoint(model, trial, metrics) - - def _save(self, output_dir: Optional[str] = None, state_dict=None): - if getattr(self.args, 'tune_mm_mlp_adapter', False): - pass - else: - super(LLaVATrainer, self)._save(output_dir, state_dict) diff --git a/LLAVA_Biovil/llava/train/train.py b/LLAVA_Biovil/llava/train/train.py deleted file mode 100644 index 5543199cdc85f0a472b3df858d43ba774f23ca30..0000000000000000000000000000000000000000 --- 
a/LLAVA_Biovil/llava/train/train.py +++ /dev/null @@ -1,1298 +0,0 @@ -# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: -# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: -# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math -import os -import random - -os.environ["WANDB_PROJECT"] = 'radiolog_llava' -import copy -from dataclasses import dataclass, field -import json -import logging -import pathlib -from typing import Dict, Optional, Sequence, List, Tuple - -import torch -from torch import Tensor -from skimage import io -import transformers -from torchvision.transforms import Compose, Resize, ToTensor, CenterCrop, transforms -from torchvision.transforms import functional as F, InterpolationMode - -from llava.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN -from torch.utils.data import Dataset -from llava.train.llava_trainer import LLaVATrainer - -from llava import conversation as conversation_lib -from llava.model import * -from llava.mm_utils import tokenizer_image_token - -from PIL import Image -import numpy as np - -local_rank = None - -# if LLAVA_MED -# IGNORE_INDEX = -100 -# DEFAULT_PAD_TOKEN = "[PAD]" -# DEFAULT_EOS_TOKEN = "" -# DEFAULT_BOS_TOKEN = "" -# DEFAULT_UNK_TOKEN = "" -# DEFAULT_IMAGE_TOKEN = "" -# DEFAULT_IMAGE_PATCH_TOKEN = "" -# DEFAULT_IM_START_TOKEN = 
"" -# DEFAULT_IM_END_TOKEN = "" - -def rank0_print(*args): - if local_rank == 0: - print(*args) - - -@dataclass -class ModelArguments: - model_name_or_path: Optional[str] = field(default="facebook/opt-125m") - version: Optional[str] = field(default="v0") - freeze_backbone: bool = field(default=False) - tune_mm_mlp_adapter: bool = field(default=False) - vision_tower: Optional[str] = field(default=None) - mm_vision_select_layer: Optional[int] = field(default=-1) # default to the last layer - pretrain_mm_mlp_adapter: Optional[str] = field(default=None) - mm_projector_type: Optional[str] = field(default='linear') - mm_use_im_start_end: bool = field(default=False) - mm_use_im_patch_token: bool = field(default=True) - mm_vision_select_feature: Optional[str] = field(default="patch") - mv_type: Optional[str] = field(default='concat') - - -@dataclass -class DataArguments: - data_path: str = field(default=None, - metadata={"help": "Path to the training data."}) - lazy_preprocess: bool = False - is_multimodal: bool = False - image_folder: Optional[str] = field(default=None) - image_aspect_ratio: str = 'square' - do_augment: bool = field(default=False) - do_img_order_augment: bool = field(default=False) - -@dataclass -class TrainingArguments(transformers.TrainingArguments): - cache_dir: Optional[str] = field(default=None) - optim: str = field(default="adamw_torch") - remove_unused_columns: bool = field(default=False) - freeze_mm_mlp_adapter: bool = field(default=False) - unfreeze_n_vision_tower_layers: Optional[int] = field(default=None) - mpt_attn_impl: Optional[str] = field(default="triton") - model_max_length: int = field( - default=512, - metadata={ - "help": - "Maximum sequence length. Sequences will be right padded (and possibly truncated)." 
- }, - ) - double_quant: bool = field( - default=True, - metadata={"help": "Compress the quantization statistics through double quantization."} - ) - quant_type: str = field( - default="nf4", - metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."} - ) - bits: int = field( - default=16, - metadata={"help": "How many bits to use."} - ) - lora_enable: bool = False - lora_r: int = 64 - lora_alpha: int = 16 - lora_dropout: float = 0.05 - lora_weight_path: str = "" - lora_bias: str = "none" - mm_projector_lr: Optional[float] = None - group_by_modality_length: bool = field(default=False) - - -def maybe_zero_3(param, ignore_status=False, name=None): - from deepspeed import zero - from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus - if hasattr(param, "ds_id"): - if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: - if not ignore_status: - logging.warning(f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}") - with zero.GatheredParameters([param]): - param = param.data.detach().cpu().clone() - else: - param = param.detach().cpu().clone() - return param - - -# Borrowed from peft.utils.get_peft_model_state_dict -def get_peft_state_maybe_zero_3(named_params, bias): - if bias == "none": - to_return = {k: t for k, t in named_params if "lora_" in k} - elif bias == "all": - to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k} - elif bias == "lora_only": - to_return = {} - maybe_lora_bias = {} - lora_bias_names = set() - for k, t in named_params: - if "lora_" in k: - to_return[k] = t - bias_name = k.split("lora_")[0] + "bias" - lora_bias_names.add(bias_name) - elif "bias" in k: - maybe_lora_bias[k] = t - for k, t in maybe_lora_bias: - if bias_name in lora_bias_names: - to_return[bias_name] = t - else: - raise NotImplementedError - to_return = {k: maybe_zero_3(v, ignore_status=True) for k, v in to_return.items()} - return to_return - - -def 
get_peft_state_non_lora_maybe_zero_3_extended(model, require_grad_only=True): - named_entities = list(model.named_parameters()) + list(model.named_buffers()) - to_return = {k: v for k, v in named_entities if "lora_" not in k} - if require_grad_only: - # For buffers, requires_grad attribute does not apply, so they should be included regardless - to_return = {k: v for k, v in to_return.items() if type(v) == torch.Tensor or v.requires_grad} - to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()} - return to_return - - -def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match): - to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)} - to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()} - return to_return - - -def find_all_linear_names(model): - cls = torch.nn.Linear - lora_module_names = set() - multimodal_keywords = ['mm_projector', 'vision_tower', 'vision_resampler', 'image_pooler'] - for name, module in model.named_modules(): - if any(mm_keyword in name for mm_keyword in multimodal_keywords): - continue - if isinstance(module, cls): - names = name.split('.') - lora_module_names.add(names[0] if len(names) == 1 else names[-1]) - - if 'lm_head' in lora_module_names: # needed for 16-bit - lora_module_names.remove('lm_head') - return list(lora_module_names) - - -def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, - output_dir: str): - """Collects the state dict and dump to disk.""" - - if getattr(trainer.args, "tune_mm_mlp_adapter", False): - # Only save Adapter - keys_to_match = ['mm_projector'] - if getattr(trainer.args, "use_im_start_end", False): - keys_to_match.extend(['embed_tokens', 'embed_in']) - - weight_to_save = get_mm_adapter_state_maybe_zero_3(trainer.model.named_parameters(), keys_to_match) - trainer.model.config.save_pretrained(output_dir) - - current_folder = output_dir.split('/')[-1] - parent_folder = 
os.path.dirname(output_dir) - if trainer.args.local_rank == 0 or trainer.args.local_rank == -1: - if current_folder.startswith('checkpoint-'): - mm_projector_folder = os.path.join(parent_folder, "mm_projector") - os.makedirs(mm_projector_folder, exist_ok=True) - torch.save(weight_to_save, os.path.join(mm_projector_folder, f'{current_folder}.bin')) - else: - torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin')) - return - - if trainer.deepspeed: - torch.cuda.synchronize() - trainer.save_model(output_dir) - return - - state_dict = trainer.model.state_dict() - if trainer.args.should_save: - cpu_state_dict = { - key: value.cpu() - for key, value in state_dict.items() - } - del state_dict - trainer._save(output_dir, state_dict=cpu_state_dict) # noqa - - -def smart_tokenizer_and_embedding_resize( - special_tokens_dict: Dict, - tokenizer: transformers.PreTrainedTokenizer, - model: transformers.PreTrainedModel, -): - """Resize tokenizer and embedding. - - Note: This is the unoptimized version that may make your embedding size not be divisible by 64. 
- """ - num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) - model.resize_token_embeddings(len(tokenizer)) - - if num_new_tokens > 0: - input_embeddings = model.get_input_embeddings().weight.data - output_embeddings = model.get_output_embeddings().weight.data - - input_embeddings_avg = input_embeddings[:-num_new_tokens].mean( - dim=0, keepdim=True) - output_embeddings_avg = output_embeddings[:-num_new_tokens].mean( - dim=0, keepdim=True) - - input_embeddings[-num_new_tokens:] = input_embeddings_avg - output_embeddings[-num_new_tokens:] = output_embeddings_avg - - -def _tokenize_fn(strings: Sequence[str], - tokenizer: transformers.PreTrainedTokenizer) -> Dict: - """Tokenize a list of strings.""" - tokenized_list = [ - tokenizer( - text, - return_tensors="pt", - padding="longest", - max_length=tokenizer.model_max_length, - truncation=True, - ) for text in strings - ] - input_ids = labels = [ - tokenized.input_ids[0] for tokenized in tokenized_list - ] - input_ids_lens = labels_lens = [ - tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() - for tokenized in tokenized_list - ] - return dict( - input_ids=input_ids, - labels=labels, - input_ids_lens=input_ids_lens, - labels_lens=labels_lens, - ) - - -def _mask_targets(target, tokenized_lens, speakers): - # cur_idx = 0 - cur_idx = tokenized_lens[0] - tokenized_lens = tokenized_lens[1:] - target[:cur_idx] = IGNORE_INDEX - for tokenized_len, speaker in zip(tokenized_lens, speakers): - if speaker == "human": - target[cur_idx+2:cur_idx + tokenized_len] = IGNORE_INDEX - cur_idx += tokenized_len - - -def _add_speaker_and_signal(header, source, get_conversation=True): - """Add speaker and start/end signal on each round.""" - BEGIN_SIGNAL = "### " - END_SIGNAL = "\n" - conversation = header - for sentence in source: - from_str = sentence["from"] - if from_str.lower() == "human": - from_str = conversation_lib.default_conversation.roles[0] - elif from_str.lower() == "gpt": - from_str = 
conversation_lib.default_conversation.roles[1] - else: - from_str = 'unknown' - sentence["value"] = (BEGIN_SIGNAL + from_str + ": " + - sentence["value"] + END_SIGNAL) - if get_conversation: - conversation += sentence["value"] - conversation += BEGIN_SIGNAL - return conversation - - -def preprocess_multimodal( - sources: Sequence[str], - data_args: DataArguments -) -> Dict: - is_multimodal = data_args.is_multimodal - if not is_multimodal: - return sources - - for source in sources: - for sentence in source: - if DEFAULT_IMAGE_TOKEN in sentence['value']: - sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '').strip() - sentence['value'] = DEFAULT_IMAGE_TOKEN + '\n' + sentence['value'] - sentence['value'] = sentence['value'].strip() - if "mmtag" in conversation_lib.default_conversation.version: - sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '' + DEFAULT_IMAGE_TOKEN + '') - replace_token = DEFAULT_IMAGE_TOKEN - if data_args.mm_use_im_start_end: - replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN - sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, replace_token) - - return sources - - -def preprocess_llama_2( - sources, - tokenizer: transformers.PreTrainedTokenizer, - has_image: bool = False -) -> Dict: - conv = conversation_lib.default_conversation.copy() - roles = {"human": conv.roles[0], "gpt": conv.roles[1]} - - # Apply prompt templates - conversations = [] - for i, source in enumerate(sources): - if roles[source[0]["from"]] != conv.roles[0]: - # Skip the first one if it is not from human - source = source[1:] - - conv.messages = [] - for j, sentence in enumerate(source): - role = roles[sentence["from"]] - assert role == conv.roles[j % 2], f"{i}" - conv.append_message(role, sentence["value"]) - conversations.append(conv.get_prompt()) - - # Tokenize conversations - - if has_image: - input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in 
conversations], dim=0) - else: - input_ids = tokenizer( - conversations, - return_tensors="pt", - padding="longest", - max_length=tokenizer.model_max_length, - truncation=True, - ).input_ids - - targets = input_ids.clone() - - assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_2 - - # Mask targets - sep = "[/INST] " - for conversation, target in zip(conversations, targets): - total_len = int(target.ne(tokenizer.pad_token_id).sum()) - - rounds = conversation.split(conv.sep2) - cur_len = 1 - target[:cur_len] = IGNORE_INDEX - for i, rou in enumerate(rounds): - if rou == "": - break - - parts = rou.split(sep) - if len(parts) != 2: - break - parts[0] += sep - - if has_image: - round_len = len(tokenizer_image_token(rou, tokenizer)) - instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2 - else: - round_len = len(tokenizer(rou).input_ids) - instruction_len = len(tokenizer(parts[0]).input_ids) - 2 - - target[cur_len : cur_len + instruction_len] = IGNORE_INDEX - - cur_len += round_len - target[cur_len:] = IGNORE_INDEX - - if cur_len < tokenizer.model_max_length: - if cur_len != total_len: - target[:] = IGNORE_INDEX - print( - f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 
- f" (ignored)" - ) - - return dict( - input_ids=input_ids, - labels=targets, - ) - - -def preprocess_v1( - sources, - tokenizer: transformers.PreTrainedTokenizer, - has_image: bool = False -) -> Dict: - conv = conversation_lib.default_conversation.copy() - roles = {"human": conv.roles[0], "gpt": conv.roles[1]} - - # Apply prompt templates - conversations = [] - for i, source in enumerate(sources): - if roles[source[0]["from"]] != conv.roles[0]: - # Skip the first one if it is not from human - source = source[1:] - - conv.messages = [] - for j, sentence in enumerate(source): - role = roles[sentence["from"]] - assert role == conv.roles[j % 2], f"{i}" - conv.append_message(role, sentence["value"]) - conversations.append(conv.get_prompt()) - - # Tokenize conversations - - if has_image: - input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) - else: - input_ids = tokenizer( - conversations, - return_tensors="pt", - padding="longest", - max_length=tokenizer.model_max_length, - truncation=True, - ).input_ids - - targets = input_ids.clone() - - assert conv.sep_style == conversation_lib.SeparatorStyle.TWO - - # Mask targets - sep = conv.sep + conv.roles[1] + ": " - for conversation, target in zip(conversations, targets): - total_len = int(target.ne(tokenizer.pad_token_id).sum()) - - rounds = conversation.split(conv.sep2) - cur_len = 1 - target[:cur_len] = IGNORE_INDEX - for i, rou in enumerate(rounds): - if rou == "": - break - - parts = rou.split(sep) - if len(parts) != 2: - break - parts[0] += sep - - if has_image: - round_len = len(tokenizer_image_token(rou, tokenizer)) - instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2 - else: - round_len = len(tokenizer(rou).input_ids) - instruction_len = len(tokenizer(parts[0]).input_ids) - 2 - - if i == len(rounds) - 2: #last round, keep answer for training - target[cur_len : cur_len + instruction_len] = IGNORE_INDEX - else: - target[cur_len : 
cur_len + round_len] = IGNORE_INDEX #previous rounds - mask everything - - cur_len += round_len - target[cur_len:] = IGNORE_INDEX - - if cur_len < tokenizer.model_max_length: - if cur_len != total_len: - target[:] = IGNORE_INDEX - print( - f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." - f" (ignored)" - ) - - return dict( - input_ids=input_ids, - labels=targets, - ) - - -def preprocess_mpt( - sources, - tokenizer: transformers.PreTrainedTokenizer, -) -> Dict: - conv = conversation_lib.default_conversation.copy() - roles = {"human": conv.roles[0], "gpt": conv.roles[1]} - - # Apply prompt templates - conversations = [] - for i, source in enumerate(sources): - if roles[source[0]["from"]] != conv.roles[0]: - # Skip the first one if it is not from human - source = source[1:] - - conv.messages = [] - for j, sentence in enumerate(source): - role = roles[sentence["from"]] - assert role == conv.roles[j % 2], f"{i}" - conv.append_message(role, sentence["value"]) - conversations.append(conv.get_prompt()) - - # Tokenize conversations - input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) - targets = input_ids.clone() - assert conv.sep_style == conversation_lib.SeparatorStyle.MPT - - # Mask targets - sep = conv.sep + conv.roles[1] - for conversation, target in zip(conversations, targets): - total_len = int(target.ne(tokenizer.pad_token_id).sum()) - - rounds = conversation.split(conv.sep) - re_rounds = [conv.sep.join(rounds[:3])] # system + user + gpt - for conv_idx in range(3, len(rounds), 2): - re_rounds.append(conv.sep.join(rounds[conv_idx:conv_idx+2])) # user + gpt - cur_len = 0 - target[:cur_len] = IGNORE_INDEX - for i, rou in enumerate(re_rounds): - if rou == "": - break - - parts = rou.split(sep) - if len(parts) != 2: - break - parts[0] += sep - round_len = len(tokenizer_image_token(rou, tokenizer)) + len(tokenizer_image_token(conv.sep, tokenizer)) - instruction_len = 
len(tokenizer_image_token(parts[0], tokenizer)) - target[cur_len : cur_len + instruction_len] = IGNORE_INDEX - - cur_len += round_len - target[cur_len:] = IGNORE_INDEX - - if cur_len < tokenizer.model_max_length: - if cur_len != total_len: - target[:] = IGNORE_INDEX - print( - f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." - f" (ignored)" - ) - - return dict( - input_ids=input_ids, - labels=targets, - ) - - -def preprocess_plain( - sources: Sequence[str], - tokenizer: transformers.PreTrainedTokenizer, -) -> Dict: - # add end signal and concatenate together - conversations = [] - for source in sources: - assert len(source) == 2 - assert DEFAULT_IMAGE_TOKEN in source[0]['value'] - source[0]['value'] = DEFAULT_IMAGE_TOKEN - conversation = source[0]['value'] + source[1]['value'] + conversation_lib.default_conversation.sep - conversations.append(conversation) - # tokenize conversations - input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations] - targets = copy.deepcopy(input_ids) - for target, source in zip(targets, sources): - tokenized_len = len(tokenizer_image_token(source[0]['value'], tokenizer)) - target[:tokenized_len] = IGNORE_INDEX - - return dict(input_ids=input_ids, labels=targets) - - -def preprocess( - sources: Sequence[str], - tokenizer: transformers.PreTrainedTokenizer, - has_image: bool = False -) -> Dict: - """ - Given a list of sources, each is a conversation list. This transform: - 1. Add signal '### ' at the beginning each sentence, with end signal '\n'; - 2. Concatenate conversations together; - 3. Tokenize the concatenated conversation; - 4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX. 
- """ - if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.PLAIN: - return preprocess_plain(sources, tokenizer) - if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.LLAMA_2: - return preprocess_llama_2(sources, tokenizer, has_image=has_image) - if conversation_lib.default_conversation.version.startswith("v1"): - return preprocess_v1(sources, tokenizer, has_image=has_image) - if conversation_lib.default_conversation.version == "mpt": - return preprocess_mpt(sources, tokenizer) - # add end signal and concatenate together - conversations = [] - for source in sources: - header = f"{conversation_lib.default_conversation.system}\n\n" - conversation = _add_speaker_and_signal(header, source) - conversations.append(conversation) - # tokenize conversations - def get_tokenize_len(prompts): - return [len(tokenizer_image_token(prompt, tokenizer)) for prompt in prompts] - - if has_image: - input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations] - else: - conversations_tokenized = _tokenize_fn(conversations, tokenizer) - input_ids = conversations_tokenized["input_ids"] - - targets = copy.deepcopy(input_ids) - for target, source in zip(targets, sources): - if has_image: - tokenized_lens = get_tokenize_len([header] + [s["value"] for s in source]) - else: - tokenized_lens = _tokenize_fn([header] + [s["value"] for s in source], tokenizer)["input_ids_lens"] - speakers = [sentence["from"] for sentence in source] - _mask_targets(target, tokenized_lens, speakers) - - return dict(input_ids=input_ids, labels=targets) - -class ExpandChannels: - """ - Transforms an image with one channel to an image with three channels by copying - pixel intensities of the image along the 1st dimension. - """ - - def __call__(self, data: torch.Tensor) -> torch.Tensor: - """ - :param data: Tensor of shape [1, H, W]. - :return: Tensor with channel copied three times, shape [3, H, W]. 
- """ - if data.shape[0] != 1: - raise ValueError(f"Expected input of shape [1, H, W], found {data.shape}") - return torch.repeat_interleave(data, 3, dim=0) - - -def _apply_op(img: Tensor, op_name: str, magnitude: float, - interpolation: InterpolationMode, fill: Optional[List[float]]): - if op_name == "ShearX": - img = F.affine(img, angle=0.0, translate=[0, 0], scale=1.0, shear=[math.degrees(magnitude), 0.0], - interpolation=interpolation, fill=fill) - elif op_name == "ShearY": - img = F.affine(img, angle=0.0, translate=[0, 0], scale=1.0, shear=[0.0, math.degrees(magnitude)], - interpolation=interpolation, fill=fill) - elif op_name == "TranslateX": - img = F.affine(img, angle=0.0, translate=[int(magnitude), 0], scale=1.0, - interpolation=interpolation, shear=[0.0, 0.0], fill=fill) - elif op_name == "TranslateY": - img = F.affine(img, angle=0.0, translate=[0, int(magnitude)], scale=1.0, - interpolation=interpolation, shear=[0.0, 0.0], fill=fill) - elif op_name == "Rotate": - img = F.rotate(img, magnitude, interpolation=interpolation, fill=fill) - elif op_name == "Brightness": - img = F.adjust_brightness(img, 1.0 + magnitude) - elif op_name == "Color": - img = F.adjust_saturation(img, 1.0 + magnitude) - elif op_name == "Contrast": - img = F.adjust_contrast(img, 1.0 + magnitude) - elif op_name == "Sharpness": - img = F.adjust_sharpness(img, 1.0 + magnitude) - elif op_name == "Posterize": - img = F.posterize(img, int(magnitude)) - elif op_name == "Solarize": - img = F.solarize(img, magnitude) - elif op_name == "AutoContrast": - img = F.autocontrast(img) - elif op_name == "Equalize": - img = F.equalize(img) - elif op_name == "Invert": - img = F.invert(img) - elif op_name == "Identity": - pass - else: - raise ValueError("The provided operator {} is not recognized.".format(op_name)) - return img - -class TrivialAugmentWide(torch.nn.Module): - r"""Dataset-independent data-augmentation with TrivialAugment Wide, as described in - `"TrivialAugment: Tuning-free Yet 
State-of-the-Art Data Augmentation" `. - If the image is torch Tensor, it should be of type torch.uint8, and it is expected - to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. - If img is PIL Image, it is expected to be in mode "L" or "RGB". - Args: - num_magnitude_bins (int): The number of different magnitude values. - interpolation (InterpolationMode): Desired interpolation enum defined by - :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. - If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. - fill (sequence or number, optional): Pixel fill value for the area outside the transformed - image. If given a number, the value is used for all bands respectively. - """ - - def __init__(self, num_magnitude_bins: int = 31, interpolation: InterpolationMode = InterpolationMode.NEAREST, - fill: Optional[List[float]] = None, strength: float = 1.0) -> None: - super().__init__() - self.num_magnitude_bins = num_magnitude_bins - self.interpolation = interpolation - self.fill = fill - self.strength = max(0.0, min(strength, 1.0)) # Ensuring strength is within [0, 1] - - def _augmentation_space(self, num_bins: int) -> Dict[str, Tuple[Tensor, bool]]: - scale_factor = self.strength - return { - "Identity": (torch.tensor(0.0), False), - "ShearX": (torch.linspace(0.0, 0.99 * scale_factor, num_bins), True), - "ShearY": (torch.linspace(0.0, 0.99 * scale_factor, num_bins), True), - "TranslateX": (torch.linspace(0.0, 32.0 * scale_factor, num_bins), True), - "TranslateY": (torch.linspace(0.0, 32.0 * scale_factor, num_bins), True), - "Rotate": (torch.linspace(0.0, 135.0 * scale_factor, num_bins), True), - "Brightness": (torch.linspace(0.0, 0.99 * scale_factor, num_bins), True), - "Color": (torch.linspace(0.0, 0.99 * scale_factor, num_bins), True), - "Contrast": (torch.linspace(0.0, 0.99 * scale_factor, num_bins), True), - "Sharpness": (torch.linspace(0.0, 
0.99 * scale_factor, num_bins), True), - #"Posterize": (8 - (torch.arange(num_bins) / ((num_bins - 1) / 6)).round().int(), False), - "Solarize": (torch.linspace(256.0, 0.0, num_bins), False), - "AutoContrast": (torch.tensor(0.0), False), - } - - def forward(self, img: Tensor) -> Tensor: - """ - img (PIL Image or Tensor): Image to be transformed. - Returns: - PIL Image or Tensor: Transformed image. - """ - fill = self.fill - if isinstance(img, Tensor): - if isinstance(fill, (int, float)): - fill = [float(fill)] * F.get_image_num_channels(img) - elif fill is not None: - fill = [float(f) for f in fill] - - op_meta = self._augmentation_space(self.num_magnitude_bins) - op_index = int(torch.randint(len(op_meta), (1,)).item()) - op_name = list(op_meta.keys())[op_index] - magnitudes, signed = op_meta[op_name] - magnitude = float(magnitudes[torch.randint(len(magnitudes), (1,), dtype=torch.long)].item()) \ - if magnitudes.ndim > 0 else 0.0 - if signed and torch.randint(2, (1,)): - magnitude *= -1.0 - - return _apply_op(img, op_name, magnitude, interpolation=self.interpolation, fill=fill) - - def __repr__(self) -> str: - s = self.__class__.__name__ + '(' - s += 'num_magnitude_bins={num_magnitude_bins}' - s += ', interpolation={interpolation}' - s += ', fill={fill}' - s += ')' - return s.format(**self.__dict__) - -class LazySupervisedDataset(Dataset): - """Dataset for supervised fine-tuning.""" - - def __init__(self, data_path: str, - tokenizer: transformers.PreTrainedTokenizer, - data_args: DataArguments): - super(LazySupervisedDataset, self).__init__() - list_data_dict = json.load(open(data_path, "r")) - - rank0_print("Formatting inputs...Skip in lazy mode") - self.tokenizer = tokenizer - self.list_data_dict = list_data_dict - self.data_args = data_args - self.do_img_order_augment = self.data_args.do_img_order_augment - - self.vis_transforms_biovil = self.create_chest_xray_transform_for_inference(512, center_crop_size=448) - - if self.data_args.do_augment: - self.augment = 
TrivialAugmentWide(strength=0.5) #0.2 weak, 0.5 strong - else: - self.augment = None - - def __len__(self): - return len(self.list_data_dict) - - def create_chest_xray_transform_for_inference(self, resize: int, center_crop_size: int) -> Compose: - """ - Defines the image transformation pipeline for Chest-Xray datasets. - - :param resize: The size to resize the image to. Linear resampling is used. - Resizing is applied on the axis with smaller shape. - :param center_crop_size: The size to center crop the image to. Square crop is applied. - """ - - transforms = [Resize(resize), CenterCrop(center_crop_size), ToTensor(), ExpandChannels()] - return Compose(transforms) - - @property - def lengths(self): - length_list = [] - for sample in self.list_data_dict: - img_tokens = 128 if 'image' in sample else 0 - length_list.append(sum(len(conv['value'].split()) for conv in sample['conversations']) + img_tokens) - return length_list - - @property - def modality_lengths(self): - length_list = [] - for sample in self.list_data_dict: - cur_len = sum(len(conv['value'].split()) for conv in sample['conversations']) - cur_len = cur_len if 'image' in sample else -cur_len - length_list.append(cur_len) - return length_list - - def remap_to_uint8(self, array: np.ndarray, percentiles=None) -> np.ndarray: - """Remap values in input so the output range is :math:`[0, 255]`. - - Percentiles can be used to specify the range of values to remap. - This is useful to discard outliers in the input data. - - :param array: Input array. - :param percentiles: Percentiles of the input values that will be mapped to ``0`` and ``255``. - Passing ``None`` is equivalent to using percentiles ``(0, 100)`` (but faster). - :returns: Array with ``0`` and ``255`` as minimum and maximum values. 
- """ - array = array.astype(float) - if percentiles is not None: - len_percentiles = len(percentiles) - if len_percentiles != 2: - message = ( - 'The value for percentiles should be a sequence of length 2,' - f' but has length {len_percentiles}' - ) - raise ValueError(message) - a, b = percentiles - if a >= b: - raise ValueError(f'Percentiles must be in ascending order, but a sequence "{percentiles}" was passed') - if a < 0 or b > 100: - raise ValueError(f'Percentiles must be in the range [0, 100], but a sequence "{percentiles}" was passed') - cutoff: np.ndarray = np.percentile(array, percentiles) - array = np.clip(array, *cutoff) - array -= array.min() - array /= array.max() - array *= 255 - return array.astype(np.uint8) - - def load_image_biovil(self, image_folder, image_file) -> Image.Image: - """Load an image from disk. - - The image values are remapped to :math:`[0, 255]` and cast to 8-bit unsigned integers. - - :param path: Path to image. - :returns: Image as ``Pillow`` ``Image``. 
- """ - # Although ITK supports JPEG and PNG, we use Pillow for consistency with older trained models - if image_file.startswith('/home'): # full path - path = pathlib.Path(image_file) - elif image_file.startswith('files/'): # mimic-cxr - path = pathlib.Path(os.path.join(image_folder, image_file)) - else: - path = pathlib.Path("/home/guests/chantal_pellegrini/" + image_file) #radrestruct - if path.suffix in [".jpg", ".jpeg", ".png"]: - image = io.imread(path) - else: - raise ValueError(f"Image type not supported, filename was: {path}") - - image = self.remap_to_uint8(image) - return Image.fromarray(image).convert("L") - - def __getitem__(self, i) -> Dict[str, torch.Tensor]: - sources = self.list_data_dict[i] - if isinstance(i, int): - sources = [sources] - assert len(sources) == 1, "Don't know why it is wrapped to a list" # FIXME - if 'image' in sources[0]: # 1 or multiple current images - image_files = self.list_data_dict[i]['image'] if type(self.list_data_dict[i]['image']) == list else [self.list_data_dict[i]['image']] #convert to list - image_folder = self.data_args.image_folder - - if self.do_img_order_augment: - random.shuffle(image_files) - n_images = random.randint(1,len(image_files)) - image_files = image_files[:n_images] - - if self.data_args.vision_tower == 'biovil': - images = [self.load_image_biovil(image_folder, image_file) for image_file in image_files] - # augment images - if self.augment is not None: - images = [self.augment(image) for image in images] - images = [self.vis_transforms_biovil(img) for img in images] - else: - processor = self.data_args.image_processor - images = [Image.open(os.path.join(image_folder, image_file)).convert('RGB') for image_file in image_files] - if self.data_args.image_aspect_ratio == 'pad': - def expand2square(pil_img, background_color): - width, height = pil_img.size - if width == height: - return pil_img - elif width > height: - result = Image.new(pil_img.mode, (width, width), background_color) - 
result.paste(pil_img, (0, (width - height) // 2)) - return result - else: - result = Image.new(pil_img.mode, (height, height), background_color) - result.paste(pil_img, ((height - width) // 2, 0)) - return result - images = [expand2square(image, tuple(int(x*255) for x in processor.image_mean)) for image in images] - images = [processor.preprocess(image, return_tensors='pt')['pixel_values'][0] for image in images] - else: - images = [processor.preprocess(image, return_tensors='pt')['pixel_values'][0] for image in images] - # stack images - images = torch.stack(images, dim=0).squeeze() #stack and drop unnecessary dimension - - if 'prev_image' in sources[0]: # 1 or multiple previous images - raise NotImplementedError("Previous image is not supported yet") - prev_image_files = self.list_data_dict[i]['prev_image'] if type(self.list_data_dict[i]['prev_image']) == list else [ - self.list_data_dict[i]['prev_image']] # convert to list - image_folder = self.data_args.image_folder - - if self.data_args.vision_tower == 'biovil': - prev_images = [self.load_image_biovil(image_folder, image_file) for image_file in prev_image_files] - - prev_images = [self.vis_transforms_biovil(img) for img in prev_images] - else: - processor = self.data_args.image_processor - prev_images = [Image.open(os.path.join(image_folder, image_file)).convert('RGB') for image_file in prev_image_files] - if self.data_args.image_aspect_ratio == 'pad': - def expand2square(pil_img, background_color): - width, height = pil_img.size - if width == height: - return pil_img - elif width > height: - result = Image.new(pil_img.mode, (width, width), background_color) - result.paste(pil_img, (0, (width - height) // 2)) - return result - else: - result = Image.new(pil_img.mode, (height, height), background_color) - result.paste(pil_img, ((height - width) // 2, 0)) - return result - - prev_images = [expand2square(image, tuple(int(x * 255) for x in processor.image_mean)) for image in prev_images] - prev_images = 
[processor.preprocess(image, return_tensors='pt')['pixel_values'][0] for image in prev_images] - else: - prev_images = [processor.preprocess(image, return_tensors='pt')['pixel_values'][0] for image in prev_images] - - # stack images - prev_images = torch.stack(prev_images, dim=0).squeeze() if len(prev_images) > 0 else prev_images #stack and drop unnecessary dimension - - # drop images from the data - sources = preprocess_multimodal( - copy.deepcopy([e["conversations"] for e in sources]), - self.data_args) - - else: - sources = copy.deepcopy([e["conversations"] for e in sources]) - data_dict = preprocess( - sources, - self.tokenizer, - has_image=('image' in self.list_data_dict[i])) - if isinstance(i, int): - data_dict = dict(input_ids=data_dict["input_ids"][0], - labels=data_dict["labels"][0]) - - # image exist in the data - if 'image' in self.list_data_dict[i]: - data_dict['image'] = images[0] if len(images) == 1 else images - if 'prev_image' in self.list_data_dict[i]: - data_dict['prev_image'] = prev_images[0] if len(prev_images) == 1 else prev_images - if prev_images == []: - data_dict['prev_image'] = None - elif self.data_args.is_multimodal: - # image does not exist in the data, but the model is multimodal - crop_size = self.data_args.image_processor.crop_size - data_dict['image'] = torch.zeros(3, crop_size['height'], crop_size['width']) - return data_dict - - -@dataclass -class DataCollatorForSupervisedDataset(object): - """Collate examples for supervised fine-tuning.""" - - tokenizer: transformers.PreTrainedTokenizer - - def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: - input_ids, labels = tuple([instance[key] for instance in instances] - for key in ("input_ids", "labels")) - input_ids = torch.nn.utils.rnn.pad_sequence( - input_ids, - batch_first=True, - padding_value=self.tokenizer.pad_token_id) - labels = torch.nn.utils.rnn.pad_sequence(labels, - batch_first=True, - padding_value=IGNORE_INDEX) - input_ids = input_ids[:, 
:self.tokenizer.model_max_length] - labels = labels[:, :self.tokenizer.model_max_length] - batch = dict( - input_ids=input_ids, - labels=labels, - attention_mask=input_ids.ne(self.tokenizer.pad_token_id), - ) - - if 'image' in instances[0]: - images = [instance['image'] for instance in instances] - if not 'prev_image' in instances[0] and all(x is not None and x.shape == images[0].shape for x in images): - batch['images'] = torch.stack(images) - else: - # extend the dimension of all images to 4 if it is only 3 (1x dimension to be treated as multi-image) - images = [image.unsqueeze(0) if len(image.shape) == 3 else image for image in images] - batch['images'] = images - - if 'prev_image' in instances[0]: - prev_images = [instance['prev_image'] for instance in instances] - prev_images = [image.unsqueeze(0) if len(image.shape) == 3 else image for image in prev_images] - batch['prev_images'] = prev_images - - return batch - - -def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, - data_args) -> Dict: - """Make dataset and collator for supervised fine-tuning.""" - train_dataset = LazySupervisedDataset(tokenizer=tokenizer, - data_path=data_args.data_path, - data_args=data_args) - data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) - return dict(train_dataset=train_dataset, - eval_dataset=None, - data_collator=data_collator) - - -def train(): - global local_rank - - parser = transformers.HfArgumentParser( - (ModelArguments, DataArguments, TrainingArguments)) - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - data_args.vision_tower = model_args.vision_tower #info for image preprocessing - local_rank = training_args.local_rank - compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32)) - - bnb_model_from_pretrained_args = {} - if training_args.bits in [4, 8]: - from transformers import BitsAndBytesConfig - 
bnb_model_from_pretrained_args.update(dict( - device_map={"": training_args.device}, - load_in_4bit=training_args.bits == 4, - load_in_8bit=training_args.bits == 8, - quantization_config=BitsAndBytesConfig( - load_in_4bit=training_args.bits == 4, - load_in_8bit=training_args.bits == 8, - llm_int8_skip_modules=["mm_projector", "image_pooler"], - llm_int8_threshold=6.0, - llm_int8_has_fp16_weight=False, - bnb_4bit_compute_dtype=compute_dtype, - bnb_4bit_use_double_quant=training_args.double_quant, - bnb_4bit_quant_type=training_args.quant_type # {'fp4', 'nf4'} - ) - )) - - if model_args.vision_tower is not None: - if 'mpt' in model_args.model_name_or_path: - config = transformers.AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True) - config.attn_config['attn_impl'] = training_args.mpt_attn_impl - model = LlavaMPTForCausalLM.from_pretrained( - model_args.model_name_or_path, - config=config, - cache_dir=training_args.cache_dir, - **bnb_model_from_pretrained_args - ) - else: - model = LlavaLlamaForCausalLM.from_pretrained( - model_args.model_name_or_path, - mv_type = model_args.mv_type, - mm_vision_tower=model_args.vision_tower, - cache_dir=training_args.cache_dir, - ignore_mismatched_sizes=True, - **bnb_model_from_pretrained_args - ) - else: - model = transformers.LlamaForCausalLM.from_pretrained( - model_args.model_name_or_path, - cache_dir=training_args.cache_dir, - **bnb_model_from_pretrained_args - ) - model.config.use_cache = False - - if model_args.freeze_backbone: - model.model.requires_grad_(False) - - if training_args.bits in [4, 8]: - from peft import prepare_model_for_kbit_training - model.config.torch_dtype=(torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32)) - model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing) - - if training_args.gradient_checkpointing: - if hasattr(model, "enable_input_require_grads"): - 
model.enable_input_require_grads() - else: - def make_inputs_require_grad(module, input, output): - output.requires_grad_(True) - model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) - - if training_args.lora_enable: - from peft import LoraConfig, get_peft_model - lora_config = LoraConfig( - r=training_args.lora_r, - lora_alpha=training_args.lora_alpha, - target_modules=find_all_linear_names(model), - lora_dropout=training_args.lora_dropout, - bias=training_args.lora_bias, - task_type="CAUSAL_LM", - ) - if training_args.bits == 16: - if training_args.bf16: - model.to(torch.bfloat16) - if training_args.fp16: - model.to(torch.float16) - rank0_print("Adding LoRA adapters...") - model = get_peft_model(model, lora_config) - - if 'mpt' in model_args.model_name_or_path: - tokenizer = transformers.AutoTokenizer.from_pretrained( - model_args.model_name_or_path, - cache_dir=training_args.cache_dir, - model_max_length=training_args.model_max_length, - padding_side="right" - ) - else: - tokenizer = transformers.AutoTokenizer.from_pretrained( - model_args.model_name_or_path, - cache_dir=training_args.cache_dir, - model_max_length=training_args.model_max_length, - padding_side="right", - use_fast=False, - ) - - if model_args.version == "v0": - if tokenizer.pad_token is None: - smart_tokenizer_and_embedding_resize( - special_tokens_dict=dict(pad_token="[PAD]"), - tokenizer=tokenizer, - model=model, - ) - elif model_args.version == "v0.5": - tokenizer.pad_token = tokenizer.unk_token - else: - tokenizer.pad_token = tokenizer.unk_token - if model_args.version in conversation_lib.conv_templates: - conversation_lib.default_conversation = conversation_lib.conv_templates[model_args.version] - else: - conversation_lib.default_conversation = conversation_lib.conv_templates["vicuna_v1"] - - if model_args.vision_tower is not None: - model.get_model().initialize_vision_modules( - model_args=model_args, - fsdp=training_args.fsdp - ) - - vision_tower = 
model.get_vision_tower() - vision_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device) - - data_args.image_processor = vision_tower.image_processor - data_args.is_multimodal = True - - model.config.image_aspect_ratio = data_args.image_aspect_ratio - model.config.tokenizer_padding_side = tokenizer.padding_side - model.config.tokenizer_model_max_length = tokenizer.model_max_length - - model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter - if model_args.tune_mm_mlp_adapter: - model.requires_grad_(False) - for p in model.get_model().mm_projector.parameters(): - p.requires_grad = True - - model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter - if training_args.freeze_mm_mlp_adapter: - for p in model.get_model().mm_projector.parameters(): - p.requires_grad = False - - # reinitialize image_pooler - if model.get_model().image_pooler is not None: - model.get_model().image_pooler.bert = model.get_model().image_pooler.bert.apply(model.get_model().image_pooler.bert._init_weights) - - if training_args.bits in [4, 8]: - model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device) - model.get_model().image_pooler.to(dtype=compute_dtype, device=training_args.device) - - model.config.mm_use_im_start_end = data_args.mm_use_im_start_end = model_args.mm_use_im_start_end - model.config.mm_projector_lr = training_args.mm_projector_lr - training_args.use_im_start_end = model_args.mm_use_im_start_end - model.config.mm_use_im_patch_token = model_args.mm_use_im_patch_token - model.initialize_vision_tokenizer(model_args, tokenizer=tokenizer) - - if training_args.unfreeze_n_vision_tower_layers is not None: - if data_args.vision_tower == 'biovil': - print(f'Unfreezing all vision tower layers') - for param in model.get_vision_tower().parameters(): - param.requires_grad = True - # unfreeze vit_pooler and last image encoder layer - # print(f'Unfreezing partial 
vision tower layers') - # for param in model.get_vision_tower().encoder.encoder.layer4.parameters(): - # param.requires_grad = True - # for param in model.get_vision_tower().encoder.encoder.fc.parameters(): - # param.requires_grad = True - # for param in model.get_vision_tower().encoder.vit_pooler.parameters(): - # param.requires_grad = True - else: - print(f'Unfreezing last {training_args.unfreeze_n_vision_tower_layers} layers of vision tower') - for layer in model.get_vision_tower().vision_tower.vision_model.encoder.layers[-training_args.unfreeze_n_vision_tower_layers:]: - for param in layer.parameters(): - param.requires_grad = True - - if training_args.bits in [4, 8]: - from peft.tuners.lora import LoraLayer - for name, module in model.named_modules(): - if isinstance(module, LoraLayer): - if training_args.bf16: - module = module.to(torch.bfloat16) - if 'norm' in name: - module = module.to(torch.float32) - if 'lm_head' in name or 'embed_tokens' in name: - if hasattr(module, 'weight'): - if training_args.bf16 and module.weight.dtype == torch.float32: - module = module.to(torch.bfloat16) - - data_module = make_supervised_data_module(tokenizer=tokenizer, - data_args=data_args) - - from transformers import TrainerCallback - class SaveCallback(TrainerCallback): - def on_save(self, args, state, control, **kwargs): - print("on save") - checkpoint_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(state.global_step)) - if args.lora_enable: - state_dict = get_peft_state_maybe_zero_3( - model.named_parameters(), training_args.lora_bias - ) - non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3_extended( - model - ) - print("Saving LoRA state dict...") - print(args.local_rank) - if args.local_rank in [-1, 0]: - model.config.save_pretrained(checkpoint_dir) - model.save_pretrained(checkpoint_dir, state_dict=state_dict) - print(checkpoint_dir) - torch.save(non_lora_state_dict, os.path.join(checkpoint_dir, 'non_lora_trainables.bin')) - - from 
llava.train.llama_patch import upcast_layer_for_flash_attention - model = upcast_layer_for_flash_attention(model, torch.bfloat16) - trainer = LLaVATrainer(model=model, - tokenizer=tokenizer, - args=training_args, - callbacks=[SaveCallback()], - **data_module) - - if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")): - trainer.train(resume_from_checkpoint=True) - else: - trainer.train() - trainer.save_state() - - model.config.use_cache = True - - if training_args.lora_enable: - state_dict = get_peft_state_maybe_zero_3( - model.named_parameters(), training_args.lora_bias - ) - non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3_extended( - model - ) - if training_args.local_rank == 0 or training_args.local_rank == -1: - model.config.save_pretrained(training_args.output_dir) - model.save_pretrained(training_args.output_dir, state_dict=state_dict) - torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin')) - else: - safe_save_model_for_hf_trainer(trainer=trainer, - output_dir=training_args.output_dir) - - -if __name__ == "__main__": - train() diff --git a/LLAVA_Biovil/llava/train/train_mem.py b/LLAVA_Biovil/llava/train/train_mem.py deleted file mode 100644 index 2487d317855b27d5b07a755ee0389667e4964f02..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/train/train_mem.py +++ /dev/null @@ -1,13 +0,0 @@ -# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: -# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: -# Make it more memory efficient by monkey patching the LLaMA model with FlashAttn. - -# Need to call this before importing transformers. 
-from llava.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn - -replace_llama_attn_with_flash_attn() - -from llava.train.train import train - -if __name__ == "__main__": - train() diff --git a/LLAVA_Biovil/llava/train/train_xformers.py b/LLAVA_Biovil/llava/train/train_xformers.py deleted file mode 100644 index 90a82b09e273f65964f1a4e22dcdf61ea5fc0a12..0000000000000000000000000000000000000000 --- a/LLAVA_Biovil/llava/train/train_xformers.py +++ /dev/null @@ -1,13 +0,0 @@ -# Make it more memory efficient by monkey patching the LLaMA model with xformers attention. - -# Need to call this before importing transformers. -from LLAV.llava.train.llama_xformers_attn_monkey_patch import ( - replace_llama_attn_with_xformers_attn, -) - -replace_llama_attn_with_xformers_attn() - -from LLAV.llava.train.train import train - -if __name__ == "__main__": - train() diff --git a/LLAVA_Biovil/llava/utils.py b/LLAVA_Biovil/llava/utils.py index 6a9de097b66616cfae6af6d0ee78164d68fed0a7..fb79d14fb105a75758b326505c31f5d2cd36801d 100644 --- a/LLAVA_Biovil/llava/utils.py +++ b/LLAVA_Biovil/llava/utils.py @@ -5,7 +5,7 @@ import requests -from llava.constants import LOGDIR +from LLAVA_Biovil.llava.constants import LOGDIR server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." diff --git a/findings_classifier/chexpert_model.py b/findings_classifier/chexpert_model.py index 950222694ed3f94163e61c2ce94eb226ab60da4c..d5f497cebfa804c9daf28dfe8a51b5c96b6250a8 100644 --- a/findings_classifier/chexpert_model.py +++ b/findings_classifier/chexpert_model.py @@ -1,7 +1,7 @@ import torch from torch import nn -from biovil_t.pretrained import get_biovil_t_image_encoder +from LLAVA_Biovil.biovil_t.pretrained import get_biovil_t_image_encoder class ChexpertClassifier(nn.Module):