Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add llava/eval from VILA-Internal and edit the README.md #3

Merged
merged 2 commits into from
Feb 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ python -W ignore llava/eval/run_llava.py \
--model-name Efficient-Large-Model/VILA-7B \
--conv-mode vicuna_v1 \
--query "<image>\n Please describe the traffic condition." \
--image-file "av.png"
--image-file "demo_images/av.png"
```

VILA-13B inference:
Expand All @@ -180,7 +180,7 @@ python -W ignore llava/eval/run_llava.py \
--model-name Efficient-Large-Model/VILA-13B \
--conv-mode vicuna_v1 \
--query "<image>\n Please describe the traffic condition." \
--image-file "av.png"
--image-file "demo_images/av.png"
```

## Quantization and Deployment
Expand Down
Binary file added demo_images/av.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
115 changes: 115 additions & 0 deletions llava/eval/eval_gpt_review.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# This file is modified from https://github.com/haotian-liu/LLaVA/

import argparse
import json
import os

import openai
import tqdm
import ray
import time

NUM_SECONDS_TO_SLEEP = 3

@ray.remote(num_cpus=4)
def get_eval(content: str, max_tokens: int):
while True:
try:
response = openai.chat.completions.create(
model='gpt-4',
messages=[{
'role': 'system',
'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
}, {
'role': 'user',
'content': content,
}],
temperature=0.2, # TODO: figure out which temperature is best for evaluation
max_tokens=max_tokens,
)
break
except openai.error.RateLimitError:
pass
except Exception as e:
print(e)
time.sleep(NUM_SECONDS_TO_SLEEP)

print('success!')
return response.choices[0].message.content


def parse_score(review):
try:
score_pair = review.split('\n')[0]
score_pair = score_pair.replace(',', ' ')
sp = score_pair.split(' ')
if len(sp) == 2:
return [float(sp[0]), float(sp[1])]
else:
print('error', review)
return [-1, -1]
except Exception as e:
print(e)
print('error', review)
return [-1, -1]


if __name__ == '__main__':
parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
parser.add_argument('-q', '--question')
# parser.add_argument('-a', '--answer')
parser.add_argument('-a', '--answer-list', nargs='+', default=[])
parser.add_argument('-r', '--rule')
parser.add_argument('-o', '--output')
parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
args = parser.parse_args()

ray.init()

f_q = open(os.path.expanduser(args.question))
f_ans1 = open(os.path.expanduser(args.answer_list[0]))
f_ans2 = open(os.path.expanduser(args.answer_list[1]))
rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))

review_file = open(f'{args.output}', 'w')

js_list = []
handles = []
idx = 0
for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
# if idx == 1:
# break

ques = json.loads(ques_js)
ans1 = json.loads(ans1_js)
ans2 = json.loads(ans2_js)

category = json.loads(ques_js)['category']
if category in rule_dict:
rule = rule_dict[category]
else:
rule = rule_dict['default']
prompt = rule['prompt']
role = rule['role']
content = (f'[Question]\n{ques["text"]}\n\n'
f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
f'[System]\n{prompt}\n\n')
js_list.append({
'id': idx+1,
'question_id': ques['question_id'],
'answer1_id': ans1['answer_id'],
'answer2_id': ans2['answer_id'],
'category': category})
idx += 1
handles.append(get_eval.remote(content, args.max_tokens))
# To avoid the rate limit set by OpenAI
time.sleep(NUM_SECONDS_TO_SLEEP)

reviews = ray.get(handles)
for idx, review in enumerate(reviews):
scores = parse_score(review)
js_list[idx]['content'] = review
js_list[idx]['tuple'] = scores
review_file.write(json.dumps(js_list[idx]) + '\n')
review_file.close()
123 changes: 123 additions & 0 deletions llava/eval/eval_gpt_review_bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# This file is modified from https://github.com/haotian-liu/LLaVA/

import argparse
import json
import os

import openai
import time

NUM_SECONDS_TO_SLEEP = 0.5


def get_eval(content: str, max_tokens: int):
while True:
try:
response = openai.chat.completions.create(
model='gpt-4-0613',
messages=[{
'role': 'system',
'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
}, {
'role': 'user',
'content': content,
}],
temperature=0.2, # TODO: figure out which temperature is best for evaluation
max_tokens=max_tokens,
)
break
except openai.error.RateLimitError:
pass
except Exception as e:
print(e)
time.sleep(NUM_SECONDS_TO_SLEEP)

return response.choices[0].message.content


def parse_score(review):
try:
score_pair = review.split('\n')[0]
score_pair = score_pair.replace(',', ' ')
sp = score_pair.split(' ')
if len(sp) == 2:
return [float(sp[0]), float(sp[1])]
else:
print('error', review)
return [-1, -1]
except Exception as e:
print(e)
print('error', review)
return [-1, -1]


if __name__ == '__main__':
parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
parser.add_argument('-q', '--question')
parser.add_argument('-c', '--context')
parser.add_argument('-a', '--answer-list', nargs='+', default=[])
parser.add_argument('-r', '--rule')
parser.add_argument('-o', '--output')
parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
args = parser.parse_args()

f_q = open(os.path.expanduser(args.question))
f_ans1 = open(os.path.expanduser(args.answer_list[0]))
f_ans2 = open(os.path.expanduser(args.answer_list[1]))
rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))

if os.path.isfile(os.path.expanduser(args.output)):
cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))]
else:
cur_reviews = []

review_file = open(f'{args.output}', 'a')

context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
image_to_context = {context['image']: context for context in context_list}

handles = []
idx = 0
for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
ques = json.loads(ques_js)
ans1 = json.loads(ans1_js)
ans2 = json.loads(ans2_js)

inst = image_to_context[ques['image']]

if isinstance(inst['caption'], list):
cap_str = '\n'.join(inst['caption'])
else:
cap_str = inst['caption']

category = 'llava_bench_' + json.loads(ques_js)['category']
if category in rule_dict:
rule = rule_dict[category]
else:
assert False, f"Visual QA category not found in rule file: {category}."
prompt = rule['prompt']
role = rule['role']
content = (f'[Context]\n{cap_str}\n\n'
f'[Question]\n{ques["text"]}\n\n'
f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
f'[System]\n{prompt}\n\n')
cur_js = {
'id': idx+1,
'question_id': ques['question_id'],
'answer1_id': ans1.get('answer_id', ans1['question_id']),
'answer2_id': ans2.get('answer_id', ans2['answer_id']),
'category': category
}
if idx >= len(cur_reviews):
review = get_eval(content, args.max_tokens)
scores = parse_score(review)
cur_js['content'] = review
cur_js['tuple'] = scores
review_file.write(json.dumps(cur_js) + '\n')
review_file.flush()
else:
print(f'Skipping {idx} as we already have it.')
idx += 1
print(idx)
review_file.close()
120 changes: 120 additions & 0 deletions llava/eval/eval_gpt_review_visual.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# This file is modified from https://github.com/haotian-liu/LLaVA/

import argparse
import json
import os

import openai
import time

NUM_SECONDS_TO_SLEEP = 0.5


def get_eval(content: str, max_tokens: int):
while True:
try:
response = openai.chat.completions.create(
model='gpt-4-0314',
messages=[{
'role': 'system',
'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
}, {
'role': 'user',
'content': content,
}],
temperature=0.2, # TODO: figure out which temperature is best for evaluation
max_tokens=max_tokens,
)
break
except openai.error.RateLimitError:
pass
except Exception as e:
print(e)
time.sleep(NUM_SECONDS_TO_SLEEP)

return response.choices[0].message.content


def parse_score(review):
try:
score_pair = review.split('\n')[0]
score_pair = score_pair.replace(',', ' ')
sp = score_pair.split(' ')
if len(sp) == 2:
return [float(sp[0]), float(sp[1])]
else:
print('error', review)
return [-1, -1]
except Exception as e:
print(e)
print('error', review)
return [-1, -1]


if __name__ == '__main__':
parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
parser.add_argument('-q', '--question')
parser.add_argument('-c', '--context')
parser.add_argument('-a', '--answer-list', nargs='+', default=[])
parser.add_argument('-r', '--rule')
parser.add_argument('-o', '--output')
parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
args = parser.parse_args()

f_q = open(os.path.expanduser(args.question))
f_ans1 = open(os.path.expanduser(args.answer_list[0]))
f_ans2 = open(os.path.expanduser(args.answer_list[1]))
rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))

if os.path.isfile(os.path.expanduser(args.output)):
cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))]
else:
cur_reviews = []

review_file = open(f'{args.output}', 'a')

context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
image_to_context = {context['image']: context for context in context_list}

handles = []
idx = 0
for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
ques = json.loads(ques_js)
ans1 = json.loads(ans1_js)
ans2 = json.loads(ans2_js)

inst = image_to_context[ques['image']]
cap_str = '\n'.join(inst['captions'])
box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']])

category = json.loads(ques_js)['category']
if category in rule_dict:
rule = rule_dict[category]
else:
assert False, f"Visual QA category not found in rule file: {category}."
prompt = rule['prompt']
role = rule['role']
content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n'
f'[Question]\n{ques["text"]}\n\n'
f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
f'[System]\n{prompt}\n\n')
cur_js = {
'id': idx+1,
'question_id': ques['question_id'],
'answer1_id': ans1.get('answer_id', ans1['question_id']),
'answer2_id': ans2.get('answer_id', ans2['answer_id']),
'category': category
}
if idx >= len(cur_reviews):
review = get_eval(content, args.max_tokens)
scores = parse_score(review)
cur_js['content'] = review
cur_js['tuple'] = scores
review_file.write(json.dumps(cur_js) + '\n')
review_file.flush()
else:
print(f'Skipping {idx} as we already have it.')
idx += 1
print(idx)
review_file.close()
Loading
Loading