Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
saliency-based-citation/merge_gen_answers_with_eli5.py
Go to file
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
72 lines (57 sloc)
2.07 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This script is deprecated; use rag_generate.py with the --output_format dataset option instead.
""" | |
import argparse | |
import json | |
from pathlib import Path | |
import copy | |
def parse_args():
    """Build the command-line parser and parse the process arguments.

    Recognised options (all optional, all strings):
        --answers_path: JSON file containing the generated answers.
        --eli5_path: JSON file containing the original ELI5 dataset.
        --output_path: where to save the modified dataset; defaults to None.

    Returns:
        argparse.Namespace with ``answers_path``, ``eli5_path`` and
        ``output_path`` attributes.
    """
    # (flag, default, help) triples, registered in the order they are listed.
    option_specs = (
        (
            '--answers_path',
            './data/eli5_eval_bm25_top100_reranked_oracle_answers_llama31_70B_42.json',
            'Path to the file containing generated answers',
        ),
        (
            '--eli5_path',
            './data/eli5_eval_bm25_top100_reranked_oracle.json',
            'Path to the original ELI5 dataset',
        ),
        (
            '--output_path',
            None,
            'Path to save the modified dataset. If not provided, will append "_generated" to the original filename',
        ),
    )

    cli = argparse.ArgumentParser(description='Replace answers in ELI5 dataset with generated answers')
    for flag, default_value, help_text in option_specs:
        cli.add_argument(flag, type=str, default=default_value, help=help_text)
    return cli.parse_args()
def load_json(file_path):
    """Read the UTF-8 text file at *file_path* and return its parsed JSON value."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        raw_text = source.read()
    return json.loads(raw_text)
def save_json(data, file_path):
    """Serialize *data* as JSON and write it to *file_path* as UTF-8.

    The output is indented by 4 spaces and non-ASCII characters are written
    verbatim (``ensure_ascii=False``).

    The document is serialized in memory *before* the destination is opened.
    Opening with mode ``'w'`` truncates the file immediately, so serializing
    first guarantees that a serialization failure cannot wipe or partially
    overwrite an existing file.

    Args:
        data: A JSON-serializable object.
        file_path: Destination path; an existing file is overwritten.

    Raises:
        TypeError: If *data* contains an object ``json`` cannot serialize.
        ValueError: If *data* contains a circular reference.
        OSError: If the file cannot be opened or written.
    """
    serialized = json.dumps(data, indent=4, ensure_ascii=False)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(serialized)
def main():
    """Replace the ``answer`` field of each dataset item with a generated answer.

    Loads the generated answers and the dataset from the paths given on the
    command line, pairs them up by position, overwrites ``item['answer']``
    for every paired item, and saves the result as JSON.

    If the two files have different lengths, only the first
    ``min(len(dataset), len(answers))`` items are updated and a warning is
    printed to stderr; remaining dataset items keep their existing answer.

    When ``--output_path`` is not given, the output file is written next to
    the dataset file with ``_generated`` inserted before the extension.

    Raises:
        TypeError: If either JSON file does not contain a top-level list.
    """
    import sys  # local import: only needed for the warning stream

    args = parse_args()

    # Load the files
    answers = load_json(args.answers_path)
    eli5_data = load_json(args.eli5_path)

    # Positional pairing only makes sense for two lists; fail loudly instead
    # of pairing dataset items with dict keys or string characters.
    if not isinstance(answers, list):
        raise TypeError(
            f"Expected a JSON list in {args.answers_path}, got {type(answers).__name__}"
        )
    if not isinstance(eli5_data, list):
        raise TypeError(
            f"Expected a JSON list in {args.eli5_path}, got {type(eli5_data).__name__}"
        )

    if len(answers) != len(eli5_data):
        print(
            f"Warning: {len(answers)} generated answers for {len(eli5_data)} dataset items; "
            f"only the first {min(len(answers), len(eli5_data))} items will be updated",
            file=sys.stderr,
        )

    # Replace answers. The freshly loaded dataset is not used anywhere else,
    # so it is modified in place rather than deep-copied first. zip() stops
    # at the shorter of the two sequences.
    for item, answer in zip(eli5_data, answers):
        item['answer'] = answer

    # Determine output path
    if args.output_path:
        output_path = args.output_path
    else:
        # Create output path by adding '_generated' before the extension
        original_path = Path(args.eli5_path)
        output_path = original_path.parent / f"{original_path.stem}_generated{original_path.suffix}"

    # Save the modified dataset
    save_json(eli5_data, output_path)
    print(f"Modified dataset saved to: {output_path}")
# Run the merge only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()