# AF2vsNMR_ES_improvements.py

from os import listdir
from os.path import isfile, join
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from adjustText import adjust_text
# Path of the CSV table that holds the CSP_RANK consensus columns read below.
NMR_source_file = './CSPRANK.csv'

data = pd.read_csv(NMR_source_file)

# Pull the four columns of interest out as plain arrays.
holo_pdbs, apo_bmrbs, AF2_CSPRANK, NMR_CSPRANK = (
    data[column].values
    for column in ('holo_pdb', 'apo_bmrb', 'consensus_AF2', 'consensus_NMR')
)

# A row is kept only when neither the AF2 nor the NMR consensus score is NaN.
valid_indices = ~(np.isnan(AF2_CSPRANK) | np.isnan(NMR_CSPRANK))

# Apply the same row mask to every array so they stay aligned with each other.
holo_pdbs, apo_bmrbs, AF2_CSPRANK, NMR_CSPRANK = (
    column_values[valid_indices]
    for column_values in (holo_pdbs, apo_bmrbs, AF2_CSPRANK, NMR_CSPRANK)
)

assert len(NMR_CSPRANK) == len(AF2_CSPRANK), "Data length mismatch."

# PLOT IMPROVEMENTS WITH ES
# The AF+CSP score that gets plotted is the 'AF-NMR_max_consensus' column.
# (The 'AF-NMR_consensus' column used to be read here as well, but its value
# was overwritten on the very next statement and never used.)
#
# `data` still contains every CSV row, whereas the other score arrays have
# already been reduced with `valid_indices`. The same mask has to be applied
# here, otherwise this array is longer than / misaligned with NMR_CSPRANK and
# AF2_CSPRANK as soon as a single row was dropped.
AF_NMR_MAX_CSPRANK = data['AF-NMR_max_consensus'].values[valid_indices]
AF_NMR_CSPRANK = AF_NMR_MAX_CSPRANK

# Create the scatter plot: both series share the NMR score on the x axis.
plt.figure(figsize=(20, 20))
plt.scatter(NMR_CSPRANK, AF2_CSPRANK, color='b', label='top-rank AF2 vs NMR Medoid')
plt.scatter(NMR_CSPRANK, AF_NMR_CSPRANK, color='g', label='Best AF+CSP vs NMR Medoid')

# Draw a vertical arrow from every AF2 point to the matching AF+CSP point.
# With matplotlib's default (length_includes_head=False) the arrow head is
# added beyond the requested length, so arrows longer than ARROW_GAP are
# shortened by ARROW_GAP; shorter ones are drawn with their full length.
ARROW_GAP = 0.015
ARROW_HEAD = 0.01
for x, y_start, y_end in zip(NMR_CSPRANK, AF2_CSPRANK, AF_NMR_CSPRANK):
    if np.isnan(y_end):
        # No AF+CSP score for this entry, so there is no target to point at.
        continue
    if y_end > y_start + ARROW_GAP:
        dy = y_end - y_start - ARROW_GAP
    elif y_end < y_start - ARROW_GAP:
        dy = y_end - y_start + ARROW_GAP
    else:
        dy = y_end - y_start
    plt.arrow(x, y_start, 0, dy,
              head_width=ARROW_HEAD, head_length=ARROW_HEAD, fc='k', ec='k')

# Label each point that has an AF+CSP score with its 'holo_pdb' value, placed
# slightly above the marker; adjust_text then moves overlapping labels apart.
LABEL_OFFSET = 0.015
valid_text_indices = ~np.isnan(AF_NMR_CSPRANK)
texts = [
    plt.text(x, y + LABEL_OFFSET, f' {name}', fontsize=20)
    for x, y, name, has_score in zip(NMR_CSPRANK, AF_NMR_CSPRANK, holo_pdbs, valid_text_indices)
    if has_score
]
adjust_text(texts, arrowprops=dict(arrowstyle='->', color='red'))

# Add labels and title with bigger font
plt.xlabel('NMR Medoid CSP_RANK Scores', fontsize=20)
plt.ylabel('AF2 CSP_RANK Scores', fontsize=20)
plt.title('Improvement of CSP_RANK Scores using AF+CSP', fontsize=20)

# Set the x and y axis limits to [0, 1]
plt.xlim(0, 1)
plt.ylim(0, 1)

# Set the x and y axis ticks font size
plt.tick_params(axis='both', which='major', labelsize=15)

# Plot the x=y line in red. This has to happen before plt.legend(): the legend
# only lists artists that exist when it is created, so drawing the line
# afterwards would leave its 'x=y' label out of the legend.
plt.plot([0, 1], [0, 1], 'r-', label='x=y')
plt.legend()

plt.show()

# Stop the script here on purpose so that nothing below this line is executed.
# A bare `raise` outside an exception handler only works as a stop by failing
# with "RuntimeError: No active exception to re-raise"; SystemExit states the
# intent explicitly and ends the run without a misleading traceback.
raise SystemExit(0)
	from os import listdir
	from os.path import isfile, join
	import numpy as np
	import matplotlib.pyplot as plt
	import pandas as pd
	from adjustText import adjust_text
	# Path of the CSV table that holds the CSP_RANK consensus columns read below.
	NMR_source_file = './CSPRANK.csv'

	data = pd.read_csv(NMR_source_file)

	# Pull the four columns of interest out as plain arrays.
	holo_pdbs, apo_bmrbs, AF2_CSPRANK, NMR_CSPRANK = (
		data[column].values
		for column in ('holo_pdb', 'apo_bmrb', 'consensus_AF2', 'consensus_NMR')
	)

	# A row is kept only when neither the AF2 nor the NMR consensus score is NaN.
	valid_indices = ~(np.isnan(AF2_CSPRANK) | np.isnan(NMR_CSPRANK))

	# Apply the same row mask to every array so they stay aligned with each other.
	holo_pdbs, apo_bmrbs, AF2_CSPRANK, NMR_CSPRANK = (
		column_values[valid_indices]
		for column_values in (holo_pdbs, apo_bmrbs, AF2_CSPRANK, NMR_CSPRANK)
	)

	assert len(NMR_CSPRANK) == len(AF2_CSPRANK), "Data length mismatch."

	# PLOT IMPROVEMENTS WITH ES
	# The AF+CSP score that gets plotted is the 'AF-NMR_max_consensus' column.
	# (The 'AF-NMR_consensus' column used to be read here as well, but its value
	# was overwritten on the very next statement and never used.)
	#
	# `data` still contains every CSV row, whereas the other score arrays have
	# already been reduced with `valid_indices`. The same mask has to be applied
	# here, otherwise this array is longer than / misaligned with NMR_CSPRANK and
	# AF2_CSPRANK as soon as a single row was dropped.
	AF_NMR_MAX_CSPRANK = data['AF-NMR_max_consensus'].values[valid_indices]
	AF_NMR_CSPRANK = AF_NMR_MAX_CSPRANK

	# Create the scatter plot: both series share the NMR score on the x axis.
	plt.figure(figsize=(20, 20))
	plt.scatter(NMR_CSPRANK, AF2_CSPRANK, color='b', label='top-rank AF2 vs NMR Medoid')
	plt.scatter(NMR_CSPRANK, AF_NMR_CSPRANK, color='g', label='Best AF+CSP vs NMR Medoid')

	# Draw a vertical arrow from every AF2 point to the matching AF+CSP point.
	# The loop and branch bodies are nested explicitly here; with every line at
	# the same indent the for/if/elif/else statements had no bodies at all.
	# With matplotlib's default (length_includes_head=False) the arrow head is
	# added beyond the requested length, so arrows longer than ARROW_GAP are
	# shortened by ARROW_GAP; shorter ones are drawn with their full length.
	ARROW_GAP = 0.015
	ARROW_HEAD = 0.01
	for x, y_start, y_end in zip(NMR_CSPRANK, AF2_CSPRANK, AF_NMR_CSPRANK):
		if np.isnan(y_end):
			# No AF+CSP score for this entry, so there is no target to point at.
			continue
		if y_end > y_start + ARROW_GAP:
			dy = y_end - y_start - ARROW_GAP
		elif y_end < y_start - ARROW_GAP:
			dy = y_end - y_start + ARROW_GAP
		else:
			dy = y_end - y_start
		plt.arrow(x, y_start, 0, dy,
			head_width=ARROW_HEAD, head_length=ARROW_HEAD, fc='k', ec='k')

	# Label each point that has an AF+CSP score with its 'holo_pdb' value, placed
	# slightly above the marker; adjust_text then moves overlapping labels apart.
	LABEL_OFFSET = 0.015
	valid_text_indices = ~np.isnan(AF_NMR_CSPRANK)
	texts = [
		plt.text(x, y + LABEL_OFFSET, f' {name}', fontsize=20)
		for x, y, name, has_score in zip(NMR_CSPRANK, AF_NMR_CSPRANK, holo_pdbs, valid_text_indices)
		if has_score
	]
	adjust_text(texts, arrowprops=dict(arrowstyle='->', color='red'))

	# Add labels and title with bigger font
	plt.xlabel('NMR Medoid CSP_RANK Scores', fontsize=20)
	plt.ylabel('AF2 CSP_RANK Scores', fontsize=20)
	plt.title('Improvement of CSP_RANK Scores using AF+CSP', fontsize=20)

	# Set the x and y axis limits to [0, 1]
	plt.xlim(0, 1)
	plt.ylim(0, 1)

	# Set the x and y axis ticks font size
	plt.tick_params(axis='both', which='major', labelsize=15)

	# Plot the x=y line in red. This has to happen before plt.legend(): the legend
	# only lists artists that exist when it is created, so drawing the line
	# afterwards would leave its 'x=y' label out of the legend.
	plt.plot([0, 1], [0, 1], 'r-', label='x=y')
	plt.legend()

	plt.show()

	# Stop the script here on purpose. A bare `raise` outside an exception
	# handler only works as a stop by failing with "RuntimeError: No active
	# exception to re-raise"; SystemExit states the intent explicitly and ends
	# the run without a misleading traceback.
	raise SystemExit(0)