# NOTE: GitHub web-page navigation text that preceded this script in the scrape was dropped here; it was not part of the source file.
"""
This script is deprecated; use rag_generate.py with the `--output_format dataset` option instead.
"""
import argparse
import json
from pathlib import Path
import copy
def parse_args(argv=None):
    """Parse the command-line options of the answer-replacement script.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        argparse.Namespace with the attributes ``answers_path``,
        ``eli5_path`` and ``output_path`` (``None`` when not supplied).
    """
    parser = argparse.ArgumentParser(
        description='Replace answers in ELI5 dataset with generated answers'
    )
    parser.add_argument(
        '--answers_path',
        type=str,
        default='./data/eli5_eval_bm25_top100_reranked_oracle_answers_llama31_70B_42.json',
        help='Path to the file containing generated answers',
    )
    parser.add_argument(
        '--eli5_path',
        type=str,
        default='./data/eli5_eval_bm25_top100_reranked_oracle.json',
        help='Path to the original ELI5 dataset',
    )
    parser.add_argument(
        '--output_path',
        type=str,
        default=None,
        help='Path to save the modified dataset. If not provided, will append "_generated" to the original filename',
    )
    # Passing argv through lets callers and tests supply their own arguments.
    return parser.parse_args(argv)
def load_json(file_path):
    """Read the file at *file_path* as UTF-8 text and return the parsed JSON.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python value produced by ``json.load`` for the file's contents.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)
def save_json(data, file_path):
    """Write *data* to *file_path* as UTF-8 JSON indented by four spaces.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) rather
    than as ``\\uXXXX`` escapes.

    Args:
        data: JSON-serializable value to write.
        file_path: Destination path; missing parent directories are created.
    """
    # Create the destination folder first so a new output directory does not
    # make open() fail with FileNotFoundError.
    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
def main():
    """Replace each dataset item's ``'answer'`` with the generated answer.

    Loads the generated answers and the dataset from the paths given on the
    command line, overwrites ``item['answer']`` position by position, and
    saves the result. When ``--output_path`` is not given, the output file is
    the dataset path with ``_generated`` inserted before the extension.

    If the two files differ in length, only the overlapping prefix is updated
    (as before), but a warning is now printed to stderr instead of the
    mismatch passing silently.
    """
    import sys  # local import: only needed for the mismatch warning

    args = parse_args()

    # Load the files
    answers = load_json(args.answers_path)
    # The dataset is freshly loaded and used nowhere else, so it can be
    # modified in place; a deep copy would only double the memory use.
    modified_data = load_json(args.eli5_path)

    # A length mismatch would otherwise yield a dataset that silently mixes
    # original and generated answers (or drops surplus generated answers).
    if len(answers) != len(modified_data):
        print(
            f"Warning: {len(answers)} generated answers for "
            f"{len(modified_data)} dataset items; only the first "
            f"{min(len(answers), len(modified_data))} items are updated.",
            file=sys.stderr,
        )

    # Replace answers (zip stops at the shorter of the two sequences)
    for item, answer in zip(modified_data, answers):
        item['answer'] = answer

    # Determine output path
    if args.output_path:
        output_path = args.output_path
    else:
        # Create output path by adding '_generated' before the extension
        original_path = Path(args.eli5_path)
        output_path = original_path.parent / f"{original_path.stem}_generated{original_path.suffix}"

    # Save the modified dataset
    save_json(modified_data, output_path)
    print(f"Modified dataset saved to: {output_path}")
# Run the conversion only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()