# getDP.py

import os
import re
import subprocess
import sys
import pandas as pd
import csv
import ast

def parse_list(value):
    """Interpret *value* as a Python literal, falling back to the raw input.

    The value is handed to ``ast.literal_eval``. If that raises
    ``ValueError`` or ``SyntaxError`` the original *value* is returned
    unchanged, so plain text cells survive as-is.
    """
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError):
        # Not a literal: keep whatever the caller gave us.
        return value
    return parsed

def parse_csv(file_name):
    """Load *file_name* as a headered CSV and return one dict per data row.

    Rows are read with ``csv.DictReader``; every cell value is passed
    through ``parse_list`` before being stored under its column name.
    """
    with open(file_name, newline='') as handle:
        return [
            {column: parse_list(cell) for column, cell in record.items()}
            for record in csv.DictReader(handle)
        ]


def _append_record(df, record):
    """Return *df* with *record* (a column -> value mapping) added as its last row."""
    row = pd.DataFrame([record])
    if df.empty and df.columns.empty:
        # Nothing to align against, so the new row is the whole table.
        return row
    return pd.concat([df, row], ignore_index=True)


def update_row(csv_filename, apo, bound, new_values, new_columns):
    """Write *new_values* into the row of *csv_filename* identified by *bound*.

    Parameters:
        csv_filename: path of the CSV file to read and rewrite. A missing or
            empty file is treated as an empty table.
        apo: value stored in the ``apo_bmrb`` column of a newly created row.
        bound: value looked up in (and, for a new row, stored in) the
            ``holo_model_path`` column.
        new_values: values to store, paired positionally with *new_columns*.
        new_columns: column names receiving *new_values*.

    Behaviour:
        * If a row whose ``holo_model_path`` equals *bound* exists, the first
          such row is updated in place (missing columns are created).
        * Otherwise a new row is appended carrying the values plus
          ``apo_bmrb`` and ``holo_model_path``. This also holds when the
          table does not have those identifier columns yet, so a later call
          with the same *bound* updates the row instead of adding another.

    The table is always written back to *csv_filename* without the index.
    """
    try:
        df = pd.read_csv(csv_filename, low_memory=False)
    except (pd.errors.EmptyDataError, FileNotFoundError):
        # Start from an empty table when there is nothing to load.
        df = pd.DataFrame()

    data_dict = dict(zip(new_columns, new_values))
    # Identifier keys come last so they win over same-named entries in
    # new_columns.
    identified = {**data_dict, 'apo_bmrb': apo, 'holo_model_path': bound}

    if 'apo_bmrb' not in df.columns or 'holo_model_path' not in df.columns:
        df = _append_record(df, identified)
    else:
        row_index = df.index[df['holo_model_path'] == bound]
        print("UPDATING ROW INDEX : " + str(row_index))
        if row_index.empty:
            df = _append_record(df, identified)
        else:
            target = row_index[0]
            for col, val in data_dict.items():
                try:
                    df.loc[target, col] = val
                except (TypeError, ValueError):
                    # The existing column dtype cannot hold the value; widen
                    # the column instead of duplicating the row.
                    df[col] = df[col].astype(object)
                    df.loc[target, col] = val

    # Save the DataFrame back to the CSV file
    df.to_csv(csv_filename, index=False)
SCORE_PATTERNS = {
    # CSV column -> pattern whose single group captures the numeric score.
    'DP': re.compile(r"DP-Score: (-?\d\.\d+)"),
    'RPF_RECALL': re.compile(
        r"Final Recall-score for input query structures: (-?\d\.\d+)"),
    'RPF_PRECISION': re.compile(
        r"Final Precision-score for input query structures: (-?\d\.\d+)"),
}


def read_overview_text(run_dir):
    """Return the contents of every ``*ovw`` file in *run_dir*, joined by newlines.

    Files are taken in sorted name order and decoded as ISO-8859-1. Names
    starting with a dot are skipped, mirroring what a shell ``*ovw`` glob
    would match. Raises ``OSError`` when *run_dir* cannot be listed or a
    file cannot be read.
    """
    chunks = []
    for entry in sorted(os.listdir(run_dir)):
        if entry.startswith('.') or not entry.endswith('ovw'):
            continue
        path = os.path.join(run_dir, entry)
        if not os.path.isfile(path):
            continue
        with open(path, encoding='ISO-8859-1') as handle:
            chunks.append(handle.read())
    return "\n".join(chunks)


def extract_scores(text):
    """Return ``{'DP', 'RPF_RECALL', 'RPF_PRECISION'}`` scores found in *text*.

    The first occurrence of each score line is used. Raises ``ValueError``
    naming the first score that is absent from *text*.
    """
    scores = {}
    for column, pattern in SCORE_PATTERNS.items():
        match = pattern.search(text)
        if match is None:
            raise ValueError(f"no {column} score found")
        scores[column] = float(match.group(1))
    return scores


def main():
    """Collect DP/RPF scores for one PDB id and store them in its CSV file.

    Usage: ``python getDP.py <pdb>``. Every directory below ``./<PDB>/``
    whose name contains ``.pdb`` is inspected: the ``*ovw`` files in its
    ``run1`` sub-directory are scanned for the scores, which are then
    written to ``../CSP_Rank_Scores/CSP_<pdb>_CSpred.csv`` via ``update_row``.
    Directories that yield no scores are reported on stderr and skipped.
    """
    if len(sys.argv) < 2:
        print("Usage: python getDP.py <pdb>")
        sys.exit(1)

    pdb_id = sys.argv[1].upper()
    directory = f"./{pdb_id}/"

    data_source_file = f'../CSP_Rank_Scores/CSP_{pdb_id.lower()}_CSpred.csv'
    parsed_data = parse_csv(data_source_file)

    apos = [str(data['apo_bmrb']) for data in parsed_data]
    if not apos:
        print(f"No data rows found in {data_source_file}", file=sys.stderr)
        sys.exit(1)
    # The second data row is used, as before; a single-row file falls back
    # to its only row instead of raising IndexError.
    apo = apos[1] if len(apos) > 1 else apos[0]

    for root, dirs, _files in os.walk(directory):
        for name in dirs:
            if not re.search(r"\.pdb", name):
                continue
            run_dir = os.path.join(os.path.abspath(os.path.join(root, name)), 'run1')
            try:
                scores = extract_scores(read_overview_text(run_dir))
            except (OSError, ValueError) as exc:
                # Best effort: a directory without usable results is skipped.
                print(f"Skipping {run_dir}: {exc}", file=sys.stderr)
                continue

            bound_path = './PDB_FILES/' + pdb_id + '_aligned/' + name.replace('_one_chain', '')
            new_columns = list(scores)
            new_values = list(scores.values())
            print("new_values = " + str(new_values))
            print("bound_path = " + bound_path)
            try:
                update_row(data_source_file, apo.upper(), bound_path, new_values, new_columns)
            except Exception as exc:
                # Keep processing the remaining directories, but say why this
                # one was not recorded.
                print(f"Could not update {data_source_file} for {bound_path}: {exc}",
                      file=sys.stderr)


if __name__ == "__main__":
    main()
	import os
	import re
	import subprocess
	import sys
	import pandas as pd
	import csv
	import ast

	def parse_list(value):
	    """Interpret *value* as a Python literal, falling back to the raw input.

	    The value is handed to ``ast.literal_eval``. If that raises
	    ``ValueError`` or ``SyntaxError`` the original *value* is returned
	    unchanged, so plain text cells survive as-is.
	    """
	    try:
	        parsed = ast.literal_eval(value)
	    except (ValueError, SyntaxError):
	        # Not a literal: keep whatever the caller gave us.
	        return value
	    return parsed

	def parse_csv(file_name):
	    """Load *file_name* as a headered CSV and return one dict per data row.

	    Rows are read with ``csv.DictReader``; every cell value is passed
	    through ``parse_list`` before being stored under its column name.
	    """
	    with open(file_name, newline='') as handle:
	        return [
	            {column: parse_list(cell) for column, cell in record.items()}
	            for record in csv.DictReader(handle)
	        ]


	def _append_record(df, record):
	    """Return *df* with *record* (a column -> value mapping) added as its last row."""
	    row = pd.DataFrame([record])
	    if df.empty and df.columns.empty:
	        # Nothing to align against, so the new row is the whole table.
	        return row
	    return pd.concat([df, row], ignore_index=True)


	def update_row(csv_filename, apo, bound, new_values, new_columns):
	    """Write *new_values* into the row of *csv_filename* identified by *bound*.

	    Parameters:
	        csv_filename: path of the CSV file to read and rewrite. A missing or
	            empty file is treated as an empty table.
	        apo: value stored in the ``apo_bmrb`` column of a newly created row.
	        bound: value looked up in (and, for a new row, stored in) the
	            ``holo_model_path`` column.
	        new_values: values to store, paired positionally with *new_columns*.
	        new_columns: column names receiving *new_values*.

	    Behaviour:
	        * If a row whose ``holo_model_path`` equals *bound* exists, the first
	          such row is updated in place (missing columns are created).
	        * Otherwise a new row is appended carrying the values plus
	          ``apo_bmrb`` and ``holo_model_path``. This also holds when the
	          table does not have those identifier columns yet, so a later call
	          with the same *bound* updates the row instead of adding another.

	    The table is always written back to *csv_filename* without the index.
	    """
	    try:
	        df = pd.read_csv(csv_filename, low_memory=False)
	    except (pd.errors.EmptyDataError, FileNotFoundError):
	        # Start from an empty table when there is nothing to load.
	        df = pd.DataFrame()

	    data_dict = dict(zip(new_columns, new_values))
	    # Identifier keys come last so they win over same-named entries in
	    # new_columns.
	    identified = {**data_dict, 'apo_bmrb': apo, 'holo_model_path': bound}

	    if 'apo_bmrb' not in df.columns or 'holo_model_path' not in df.columns:
	        df = _append_record(df, identified)
	    else:
	        row_index = df.index[df['holo_model_path'] == bound]
	        print("UPDATING ROW INDEX : " + str(row_index))
	        if row_index.empty:
	            df = _append_record(df, identified)
	        else:
	            target = row_index[0]
	            for col, val in data_dict.items():
	                try:
	                    df.loc[target, col] = val
	                except (TypeError, ValueError):
	                    # The existing column dtype cannot hold the value; widen
	                    # the column instead of duplicating the row.
	                    df[col] = df[col].astype(object)
	                    df.loc[target, col] = val

	    # Save the DataFrame back to the CSV file
	    df.to_csv(csv_filename, index=False)
	SCORE_PATTERNS = {
	    # CSV column -> pattern whose single group captures the numeric score.
	    'DP': re.compile(r"DP-Score: (-?\d\.\d+)"),
	    'RPF_RECALL': re.compile(
	        r"Final Recall-score for input query structures: (-?\d\.\d+)"),
	    'RPF_PRECISION': re.compile(
	        r"Final Precision-score for input query structures: (-?\d\.\d+)"),
	}


	def read_overview_text(run_dir):
	    """Return the contents of every ``*ovw`` file in *run_dir*, joined by newlines.

	    Files are taken in sorted name order and decoded as ISO-8859-1. Names
	    starting with a dot are skipped, mirroring what a shell ``*ovw`` glob
	    would match. Raises ``OSError`` when *run_dir* cannot be listed or a
	    file cannot be read.
	    """
	    chunks = []
	    for entry in sorted(os.listdir(run_dir)):
	        if entry.startswith('.') or not entry.endswith('ovw'):
	            continue
	        path = os.path.join(run_dir, entry)
	        if not os.path.isfile(path):
	            continue
	        with open(path, encoding='ISO-8859-1') as handle:
	            chunks.append(handle.read())
	    return "\n".join(chunks)


	def extract_scores(text):
	    """Return ``{'DP', 'RPF_RECALL', 'RPF_PRECISION'}`` scores found in *text*.

	    The first occurrence of each score line is used. Raises ``ValueError``
	    naming the first score that is absent from *text*.
	    """
	    scores = {}
	    for column, pattern in SCORE_PATTERNS.items():
	        match = pattern.search(text)
	        if match is None:
	            raise ValueError(f"no {column} score found")
	        scores[column] = float(match.group(1))
	    return scores


	def main():
	    """Collect DP/RPF scores for one PDB id and store them in its CSV file.

	    Usage: ``python getDP.py <pdb>``. Every directory below ``./<PDB>/``
	    whose name contains ``.pdb`` is inspected: the ``*ovw`` files in its
	    ``run1`` sub-directory are scanned for the scores, which are then
	    written to ``../CSP_Rank_Scores/CSP_<pdb>_CSpred.csv`` via ``update_row``.
	    Directories that yield no scores are reported on stderr and skipped.
	    """
	    if len(sys.argv) < 2:
	        print("Usage: python getDP.py <pdb>")
	        sys.exit(1)

	    pdb_id = sys.argv[1].upper()
	    directory = f"./{pdb_id}/"

	    data_source_file = f'../CSP_Rank_Scores/CSP_{pdb_id.lower()}_CSpred.csv'
	    parsed_data = parse_csv(data_source_file)

	    apos = [str(data['apo_bmrb']) for data in parsed_data]
	    if not apos:
	        print(f"No data rows found in {data_source_file}", file=sys.stderr)
	        sys.exit(1)
	    # The second data row is used, as before; a single-row file falls back
	    # to its only row instead of raising IndexError.
	    apo = apos[1] if len(apos) > 1 else apos[0]

	    for root, dirs, _files in os.walk(directory):
	        for name in dirs:
	            if not re.search(r"\.pdb", name):
	                continue
	            run_dir = os.path.join(os.path.abspath(os.path.join(root, name)), 'run1')
	            try:
	                # Reading the files directly avoids building a shell command
	                # (and the grep alternation escaping it required).
	                scores = extract_scores(read_overview_text(run_dir))
	            except (OSError, ValueError) as exc:
	                # Best effort: a directory without usable results is skipped.
	                print(f"Skipping {run_dir}: {exc}", file=sys.stderr)
	                continue

	            bound_path = './PDB_FILES/' + pdb_id + '_aligned/' + name.replace('_one_chain', '')
	            new_columns = list(scores)
	            new_values = list(scores.values())
	            print("new_values = " + str(new_values))
	            print("bound_path = " + bound_path)
	            try:
	                update_row(data_source_file, apo.upper(), bound_path, new_values, new_columns)
	            except Exception as exc:
	                # Keep processing the remaining directories, but say why this
	                # one was not recorded.
	                print(f"Could not update {data_source_file} for {bound_path}: {exc}",
	                      file=sys.stderr)


	if __name__ == "__main__":
	    main()