# get_RPF_stats_ES_ensembles.py

import os
import pandas as pd
from tabulate import tabulate

# Summarise CSP rank scores for selected model directories under ./PDB_FILES.
#
# For every '<pdb_id>_..._files' directory of a selected target, each .pdb file
# in the directory is looked up in that target's score table, and the per-file
# metric means are averaged into one row of the final table.

# Where the model directories and the per-target score tables live.
PDB_ROOT = './PDB_FILES'
SCORE_FILE_TEMPLATE = './CSP_Rank_Scores/CSP_{pdb_id}_CSpred.csv'

# Only directories whose leading '<pdb_id>_' token is listed here are processed.
# The comparison is case-sensitive, exactly as the directory is named on disk.
TARGET_PDB_IDS = ('7jq8', '7jyn')

# Score-table columns that are averaged, in the order they appear in the table.
METRIC_COLUMNS = ('DP', 'RPF_RECALL', 'RPF_PRECISION')


def _collect_metric_means(scores, pdb_files):
    """Return the per-file mean of every metric column.

    Args:
        scores: DataFrame with a 'holo_model_path' column plus every column
            named in METRIC_COLUMNS.
        pdb_files: File names to look up in 'holo_model_path'.

    Returns:
        Dict mapping each metric column to a list holding one mean per file
        that matched at least one row. A message is printed for every file
        without a matching row, and that file contributes nothing.
    """
    means = {column: [] for column in METRIC_COLUMNS}
    for pdb_file in pdb_files:
        # Literal substring match on the stored path. na=False makes a missing
        # path count as "no match"; without it a single NaN in the column turns
        # the mask into a non-boolean array and the row selection raises.
        # NOTE(review): a substring match also accepts longer names that merely
        # contain this file name -- confirm that cannot happen in the tables.
        mask = scores['holo_model_path'].str.contains(pdb_file, regex=False, na=False)
        rows = scores[mask]
        if rows.empty:
            print(f"No matching data found for {pdb_file}")
            continue
        for column in METRIC_COLUMNS:
            means[column].append(rows[column].mean())
    return means


def _format_mean(values):
    """Format the arithmetic mean of a non-empty list with three decimals."""
    return f"{sum(values)/len(values):.3f}"


# Get all directories in PDB_FILES that end with 'files'
pdb_dirs = sorted(
    d for d in os.listdir(PDB_ROOT)
    if d.endswith('files') and os.path.isdir(os.path.join(PDB_ROOT, d))
)

# Rows and column titles of the summary table
table_data = []
headers = ['Directory', 'Avg DP', 'Avg RPF Recall', 'Avg RPF Precision', 'Files Processed']

for dir_name in pdb_dirs:
    # The PDB id is everything before the first underscore of the directory name.
    pdb_id = dir_name.split('_')[0]
    if pdb_id not in TARGET_PDB_IDS:
        continue

    dir_path = os.path.join(PDB_ROOT, dir_name)
    print(f"\nProcessing directory: {dir_path}")

    # Score tables are named after the lower-cased PDB id.
    csp_rank_score_file = SCORE_FILE_TEMPLATE.format(pdb_id=pdb_id.lower())
    if not os.path.exists(csp_rank_score_file):
        print(f"CSP rank score file not found for {pdb_id}")
        continue

    # Best effort: a directory whose table cannot be read or evaluated is
    # reported and left out of the summary instead of aborting the whole run.
    try:
        df = pd.read_csv(csp_rank_score_file)
        pdb_files = [f for f in os.listdir(dir_path) if f.endswith('.pdb')]
        metric_means = _collect_metric_means(df, pdb_files)
    except Exception as e:
        print(f"Error processing {csp_rank_score_file}: {e}")
        continue

    # Only add to the table if at least one file matched a row.
    matched_files = len(metric_means['DP'])
    if matched_files:
        table_data.append(
            [dir_name]
            + [_format_mean(metric_means[column]) for column in METRIC_COLUMNS]
            + [matched_files]
        )

# Sort table_data by directory name
table_data.sort(key=lambda x: x[0])

# Print final table
print("\nResults Summary:")
print(tabulate(table_data, headers=headers, tablefmt='grid'))
	import os
	import pandas as pd
	from tabulate import tabulate

	# Summarise CSP rank scores for selected model directories under ./PDB_FILES:
	# one table row per directory, holding the averages of the per-file DP,
	# RPF_RECALL and RPF_PRECISION means and the number of files that matched.

	# Get all directories in PDB_FILES that end with 'files'
	pdb_dirs = sorted([d for d in os.listdir('./PDB_FILES') if os.path.isdir(os.path.join('./PDB_FILES', d)) and d.endswith('files')])

	# Initialize lists to store results for table
	table_data = []
	headers = ['Directory', 'Avg DP', 'Avg RPF Recall', 'Avg RPF Precision', 'Files Processed']

	for dir_name in pdb_dirs:
		dir_path = os.path.join('./PDB_FILES', dir_name)

		# The PDB id is everything before the first underscore of the directory name
		pdb_id = dir_name.split('_')[0]

		# Only these two targets are summarised (case-sensitive comparison)
		if pdb_id not in ['7jq8', '7jyn']:
			continue
		print(f"\nProcessing directory: {dir_path}")

		# Get CSP rank score file path
		csp_rank_score_file = f'./CSP_Rank_Scores/CSP_{pdb_id.lower()}_CSpred.csv'

		# Read the CSP rank scores file if it exists
		if os.path.exists(csp_rank_score_file):
			# Best effort: any failure while reading or evaluating this table is
			# reported and the directory is left out of the summary.
			try:
				df = pd.read_csv(csp_rank_score_file)

				# Get all PDB files in the directory
				pdb_files = [f for f in os.listdir(dir_path) if f.endswith('.pdb')]

				# Initialize lists to store metrics for all files
				all_dp = []
				all_rpf_recall = []
				all_rpf_precision = []

				for pdb_file in pdb_files:
					# Find matching rows by literal substring match on the stored
					# path. na=False treats a missing path as "no match"; without
					# it a NaN in the column makes the mask non-boolean and the
					# selection raises.
					row = df[df['holo_model_path'].str.contains(pdb_file, regex=False, na=False)]

					if not row.empty:
						# Several rows may match one file; each metric is averaged over them
						all_dp.append(row['DP'].mean())
						all_rpf_recall.append(row['RPF_RECALL'].mean())
						all_rpf_precision.append(row['RPF_PRECISION'].mean())
					else:
						print(f"No matching data found for {pdb_file}")

				if all_dp:  # Only add to table if we have data
					table_data.append([
						dir_name,
						f"{sum(all_dp)/len(all_dp):.3f}",
						f"{sum(all_rpf_recall)/len(all_rpf_recall):.3f}",
						f"{sum(all_rpf_precision)/len(all_rpf_precision):.3f}",
						len(all_dp)
					])

			except Exception as e:
				print(f"Error processing {csp_rank_score_file}: {e}")
		else:
			print(f"CSP rank score file not found for {pdb_id}")

	# Sort table_data by directory name
	table_data.sort(key=lambda x: x[0])

	# Print final table
	print("\nResults Summary:")
	print(tabulate(table_data, headers=headers, tablefmt='grid'))