Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
saliency-based-citation/to_mirage_format.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
74 lines (65 sloc)
3.14 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import ast
import json
import os

from tqdm import tqdm
def convert_eli5(input_file):
    """Convert an ELI5 JSON file into the record layout this script emits.

    Args:
        input_file: Path to a JSON file holding a sequence of items. Each
            item must provide "question", "answer", "claims" and "docs";
            every entry of "docs" must provide "title" and "text".

    Returns:
        A list of dicts with the keys "question", "docs", "output",
        "answer" and "claims". Both "output" and "answer" carry the source
        item's "answer"; each doc is reduced to its "title" and "text".

    Raises:
        KeyError: If an item or a document lacks one of the required keys.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8 by specification; do not rely on the locale's
    # default encoding, which can mis-decode non-ASCII text on some platforms.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    converted_data = []
    for item in tqdm(data, desc="Converting ELI5 data"):
        # Keep only the title/text of each supporting document.
        docs = [{"title": doc["title"], "text": doc["text"]} for doc in item["docs"]]
        converted_data.append({
            "question": item["question"],
            "docs": docs,
            "output": item["answer"],
            "answer": item["answer"],
            "claims": item["claims"],
        })
    return converted_data
def convert_xorattriqa(input_file):
    """Convert one XORAttriQA JSON Lines file into a list of records.

    Each non-blank line is parsed as a JSON object. Its
    "answers_translated_en" field is a string holding a Python literal
    (presumably the repr of a list of answer strings -- confirm against the
    dataset); one output record is produced per answer in it.

    Args:
        input_file: Path to a ``.jsonl`` file. Every object must provide
            "answers_translated_en", "query_translated_en", "passage_en"
            and "ais".

    Returns:
        A list of dicts with the keys "question" (the translated query with
        every "[b]" marker removed and surrounding whitespace stripped),
        "docs" (a single document titled "NO TITLE" whose text is the
        passage), "output" (one answer) and "attributable_gt" (the item's
        "ais" value).

    Raises:
        KeyError: If a required field is missing from an item.
        ValueError: If a line is not valid JSON, or if
            "answers_translated_en" is not a valid Python literal.
        SyntaxError: If "answers_translated_en" cannot be parsed at all.
    """
    converted_data = []
    # The dataset is multilingual, so decode explicitly as UTF-8 instead of
    # depending on the platform's default encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc=f"Converting {os.path.basename(input_file)}"):
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            item = json.loads(line)
            # SECURITY: this field comes from a data file. The previous
            # eval() would execute arbitrary code embedded in it;
            # literal_eval only accepts plain Python literals.
            answers = ast.literal_eval(item["answers_translated_en"])
            for answer in answers:
                converted_data.append({
                    "question": item["query_translated_en"].replace('[b]', '').strip(),
                    "docs": [{"title": "NO TITLE", "text": item["passage_en"]}],
                    "output": answer,
                    'attributable_gt': item['ais'],
                })
    return converted_data
def _write_json(records, output_file):
    """Write *records* to *output_file* as JSON indented by two spaces."""
    # json.dump escapes non-ASCII characters by default, so the explicit
    # encoding only pins down what was already ASCII-only output.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(records, f, indent=2)


def main():
    """Command-line entry point: convert a dataset and save it as JSON.

    Positional arguments are the dataset name ("eli5" or "xorattriqa") and
    the input path. ``--output_dir`` (default "data_input_with_ans") is
    created when it does not exist.

    * eli5: the input file is converted and written to ``eli5.json``.
    * xorattriqa with a directory input: every ``*.jsonl`` file whose name
      contains none of "train", "val" or "toy" is converted and written to
      ``xorattriqa_<file stem>.json``.
    * xorattriqa with any other input: the path is converted as a single
      file and written to ``xorattriqa.json``.
    """
    parser = argparse.ArgumentParser(description="Convert ELI5 or XORAttriQA dataset to specified format")
    parser.add_argument("dataset", choices=["eli5", "xorattriqa"], help="Dataset to convert")
    parser.add_argument("input_path", help="Path to input file or directory")
    parser.add_argument("--output_dir", default="data_input_with_ans", help="Output directory")
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    if args.dataset == "eli5":
        output_file = os.path.join(args.output_dir, "eli5.json")
        _write_json(convert_eli5(args.input_path), output_file)
        print(f"Conversion complete. Output saved to {output_file}")
        return

    # Remaining choice: xorattriqa.
    if not os.path.isdir(args.input_path):
        output_file = os.path.join(args.output_dir, "xorattriqa.json")
        _write_json(convert_xorattriqa(args.input_path), output_file)
        print(f"Conversion complete. Output saved to {output_file}")
        return

    excluded_markers = ('train', 'val', 'toy')
    # os.listdir returns entries in arbitrary order; sort so that runs
    # process (and report) files in a reproducible sequence.
    for jsonl_file in sorted(os.listdir(args.input_path)):
        if not jsonl_file.endswith('.jsonl'):
            continue
        if any(marker in jsonl_file for marker in excluded_markers):
            continue
        input_file = os.path.join(args.input_path, jsonl_file)
        output_file = os.path.join(args.output_dir, f"xorattriqa_{os.path.splitext(jsonl_file)[0]}.json")
        _write_json(convert_xorattriqa(input_file), output_file)
        print(f"Conversion complete for {jsonl_file}. Output saved to {output_file}")
# Run the command-line converter only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()