# Retrieved from a GitHub blob view at commit 4d623e0e9d (74 lines, 65 sloc, 3.14 KB).
import json
import argparse
import os
from tqdm import tqdm
def convert_eli5(input_file):
    """Convert an ELI5 JSON file into the flat record format used downstream.

    Args:
        input_file: Path to a JSON file. The top-level value is iterated, so
            it is expected to be an array of items, each providing
            ``question``, ``answer``, ``claims`` and ``docs`` (an iterable of
            mappings with ``title`` and ``text``).

    Returns:
        A list of dicts with the keys ``question``, ``docs`` (each doc reduced
        to ``title`` and ``text``), ``output``, ``answer`` and ``claims``.
        ``output`` and ``answer`` both carry the item's ``answer`` value.

    Raises:
        KeyError: If an item or one of its docs lacks a required field.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8 by specification; do not depend on the platform's
    # locale default, which differs between systems.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    converted_data = []
    for item in tqdm(data, desc="Converting ELI5 data"):
        # Keep only the title/text of each supporting document.
        docs = [{"title": doc["title"], "text": doc["text"]} for doc in item["docs"]]
        converted_data.append({
            "question": item["question"],
            "docs": docs,
            "output": item["answer"],
            "answer": item["answer"],
            "claims": item["claims"],
        })
    return converted_data
def convert_xorattriqa(input_file):
    """Convert one XOR-AttriQA JSONL file into the flat record format.

    Every input line is a JSON object. One output record is produced per
    answer listed in the line's ``answers_translated_en`` field.

    Args:
        input_file: Path to a ``.jsonl`` file whose objects provide
            ``query_translated_en``, ``passage_en``, ``answers_translated_en``
            (a string holding a Python-literal sequence of answers) and
            ``ais``.

    Returns:
        A list of dicts with the keys ``question`` (``[b]`` markers removed
        and surrounding whitespace stripped), ``docs`` (a single doc titled
        ``"NO TITLE"`` holding the passage), ``output`` (one answer) and
        ``attributable_gt`` (the line's ``ais`` value).

    Raises:
        KeyError: If a line lacks one of the required fields.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        ValueError, SyntaxError: If ``answers_translated_en`` is not a valid
            Python literal.
    """
    import ast  # local import: only this converter needs it

    converted_data = []
    # The dataset is multilingual; read it as UTF-8 regardless of locale.
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc=f"Converting {os.path.basename(input_file)}"):
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            item = json.loads(line)

            # SECURITY: this field comes from an external data file. It was
            # previously passed to eval(), which would execute arbitrary code
            # embedded in the data. literal_eval only accepts Python literals
            # (lists, tuples, strings, numbers, ...) and rejects anything else.
            answers = ast.literal_eval(item["answers_translated_en"])

            # The cleaned question is the same for every answer of this line.
            question = item["query_translated_en"].replace('[b]', '').strip()

            for answer in answers:
                converted_data.append({
                    "question": question,
                    # A fresh list per record so records never share state.
                    "docs": [{"title": "NO TITLE", "text": item["passage_en"]}],
                    "output": answer,
                    'attributable_gt': item['ais'],
                })
    return converted_data
def _write_json(records, output_file):
    """Serialize *records* to *output_file* as JSON indented by two spaces."""
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(records, f, indent=2)


def main():
    """Command-line entry point: convert a dataset and write JSON output.

    Positional arguments select the dataset (``eli5`` or ``xorattriqa``) and
    the input path; ``--output_dir`` (default ``data_input_with_ans``) is
    created if it does not exist.

    * ``eli5``: the input file is converted to ``<output_dir>/eli5.json``.
    * ``xorattriqa`` with a directory: every ``.jsonl`` file whose name does
      not contain ``train``, ``val`` or ``toy`` is converted to
      ``<output_dir>/xorattriqa_<stem>.json``.
    * ``xorattriqa`` with any other path: the path is converted as a single
      file to ``<output_dir>/xorattriqa.json``.
    """
    parser = argparse.ArgumentParser(description="Convert ELI5 or XORAttriQA dataset to specified format")
    parser.add_argument("dataset", choices=["eli5", "xorattriqa"], help="Dataset to convert")
    parser.add_argument("input_path", help="Path to input file or directory")
    parser.add_argument("--output_dir", default="data_input_with_ans", help="Output directory")
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    if args.dataset == "eli5":
        output_file = os.path.join(args.output_dir, "eli5.json")
        _write_json(convert_eli5(args.input_path), output_file)
        print(f"Conversion complete. Output saved to {output_file}")
        return

    # Remaining choice is "xorattriqa" (argparse restricts the values).
    if not os.path.isdir(args.input_path):
        output_file = os.path.join(args.output_dir, "xorattriqa.json")
        _write_json(convert_xorattriqa(args.input_path), output_file)
        print(f"Conversion complete. Output saved to {output_file}")
        return

    # Directory mode: skip files whose name contains one of these substrings.
    excluded = ('train', 'val', 'toy')
    # os.listdir returns entries in arbitrary order; sort for a stable run.
    jsonl_files = sorted(
        name for name in os.listdir(args.input_path)
        if name.endswith('.jsonl') and not any(tag in name for tag in excluded)
    )
    if not jsonl_files:
        # Previously this case finished silently without producing anything.
        print(f"No eligible .jsonl files found in {args.input_path}")
        return

    for jsonl_file in jsonl_files:
        input_file = os.path.join(args.input_path, jsonl_file)
        stem = os.path.splitext(jsonl_file)[0]
        output_file = os.path.join(args.output_dir, f"xorattriqa_{stem}.json")
        _write_json(convert_xorattriqa(input_file), output_file)
        print(f"Conversion complete for {jsonl_file}. Output saved to {output_file}")


# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()