Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
sharing_fids/getdata.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
231 lines (179 sloc)
5.08 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import json
import os
import re
import shlex
import sys
# Base URL for fetching BMRB entry directories over HTTP (append the numeric
# entry id). Not referenced in the visible code -- presumably intended for the
# wget download path sketched in the notes at the bottom of the file.
bmrb_addr = 'https://bmrb.io/ftp/pub/bmrb/entry_directories/bmr'
def bruker_acqus(acqus=None):
    """Extract the pulse program name from a Bruker ``acqus`` file.

    Parameters
    ----------
    acqus : str or None
        Path to the acqus parameter file.

    Returns
    -------
    str or bool
        The PULPROG value with its angle brackets stripped, or ``False``
        when the path is missing/invalid or no PULPROG line is found.
    """
    # Explicit check instead of assert: asserts are stripped under -O and
    # a missing file already returns False, so treat None the same way.
    if acqus is None or not os.path.isfile(os.path.abspath(acqus)):
        return False
    with open(acqus, 'r') as fp:
        # iterate the file lazily instead of readlines() -- same order,
        # no need to hold the whole file in memory
        for line in fp:
            line = line.rstrip()
            if '##$PULPROG= ' in line:
                # line looks like: ##$PULPROG= <zgpr> ; keep the payload
                pulse = line.split('= ')[-1]
                return pulse.replace('<', '').replace('>', '')
    return False
def varian_procpar(procpar=None):
    """Extract the pulse sequence name (``seqfil``) from a Varian procpar file.

    In procpar format the value sits on the line *after* the ``seqfil``
    header, as the second whitespace-separated field, wrapped in quotes.

    Parameters
    ----------
    procpar : str or None
        Path to the procpar parameter file.

    Returns
    -------
    str or bool
        The sequence name, or ``False`` when the path is missing/invalid
        or no ``seqfil`` entry is found.
    """
    # Explicit check instead of assert (stripped under -O); None behaves
    # like a missing file.
    if procpar is None or not os.path.isfile(os.path.abspath(procpar)):
        return False
    with open(procpar, 'r') as fp:
        grab_next_line = False
        for line in fp:
            line = line.rstrip()
            # startswith is equivalent to re.search(r'^seqfil ', line)
            if line.startswith('seqfil '):
                grab_next_line = True
                continue
            if grab_next_line:
                # value line looks like: 1 "gNhsqc"
                return line.split()[1].replace('"', '')
    return False
def get_data(bmrbid=None, bmrbpath=None, output=None):
    """Collect time-domain FID/SER data for one BMRB entry.

    Walks ``<bmrbpath>/<bmrbid>/timedomain_data`` for directories holding a
    raw ``fid`` or ``ser`` file, determines the pulse sequence from the
    vendor parameter file (Varian ``procpar`` or Bruker ``acqus``), copies
    each experiment into ``<output>/<bmrbid>/<pulse>_<subdir>``, then tars
    and splits the raw data into <=49M chunks and removes the archived
    original.

    Returns
    -------
    tuple
        ``(True, True)`` on success, ``(False, message)`` on failure.
        Messages containing ``'aborting'`` signal fatal problems.
    """
    data_path = os.path.join(
        os.path.abspath(bmrbpath), f'{bmrbid}', 'timedomain_data')
    if not os.path.isdir(data_path):
        return False, f'bmrb id {bmrbid} does not have timedomain_data'
    out = os.path.join(os.path.abspath(output), f'{bmrbid}')
    os.makedirs(out, exist_ok=True)  # replaces os.system('mkdir -p ...')
    for root, dirs, files in os.walk(data_path):
        if 'fid' not in files and 'ser' not in files:
            continue
        if 'procpar' in files:
            pulse = varian_procpar(procpar=os.path.join(root, 'procpar'))
            if not pulse:
                return False, 'no seqfil?\naborting'
        elif 'acqus' in files:
            pulse = bruker_acqus(acqus=os.path.join(root, 'acqus'))
            if not pulse:
                return False, 'no pulprog?\naborting'
        else:
            # original wrapped this f-string in ''.join(...), which is a no-op
            return False, f'no acquisition/param files?\npath: {root}\naborting'
        # flatten the sub-path below timedomain_data into one directory name
        source_folder = root.split('timedomain_data')[-1].replace('/', '_')
        exp_dir = os.path.join(out, f'{pulse}_{source_folder}')
        os.makedirs(exp_dir, exist_ok=True)
        # quote paths so spaces/metacharacters in BMRB paths can't break
        # (or inject into) the shell command; the bare /* keeps glob expansion
        os.system(f'cp {shlex.quote(root)}/* {shlex.quote(exp_dir)} 2>/dev/null')
        raw = 'fid' if 'fid' in files else 'ser'
        src = os.path.join(exp_dir, raw)
        # tar and split into <=49M pieces (keeps chunks under hosting limits)
        os.system(
            f'tar -czf - {shlex.quote(src)} | '
            f'split -b49M - {shlex.quote(exp_dir)}/s{raw}.')
        # Remove only the file that was archived. The original rm'ed BOTH
        # fid and ser: when a directory held both, ser was deleted without
        # ever being archived (data loss), and the absent one spammed errors.
        if os.path.isfile(src):
            os.remove(src)
    return True, True
# Command-line interface: options declared as a table, registered in a loop.
_CLI_OPTIONS = [
    (('--artinaset', '-a'),
     dict(required=False, type=str, metavar='<str>',
          help='path to artina set meta data, JSON file included with repository')),
    (('--bmrb', '-b'),
     dict(required=False, type=str, metavar='<str>',
          help='path to BMRB data rsync directory')),
    (('--output', '-o'),
     dict(required=True, type=str, metavar='<str>',
          help='path to where to save fid datasets collected')),
    (('--num', '-n'),
     dict(required=False, type=int, metavar='<int>',
          help='number of fid datasets to collect from artina set')),
    (('--ids', '-i'),
     dict(nargs='+', required=False, type=str, metavar='<str>',
          help='provide a list of bmrb ids to collect fid datasets from')),
]

parser = argparse.ArgumentParser(description='Collect FIDs')
for _flags, _opts in _CLI_OPTIONS:
    parser.add_argument(*_flags, **_opts)
arg = parser.parse_args()
# ---- argument sanity checks ----------------------------------------------
# Explicit error messages + sys.exit() instead of bare asserts: asserts are
# stripped under python -O and give users an unhelpful traceback.
if arg.bmrb and not os.path.isdir(os.path.abspath(arg.bmrb)):
    print(f'--bmrb path {arg.bmrb} is not a directory')
    print('aborting')
    sys.exit()
if arg.artinaset and not os.path.isfile(os.path.abspath(arg.artinaset)):
    print(f'--artinaset path {arg.artinaset} is not a file')
    print('aborting')
    sys.exit()
# note: arg.num == 0 is falsy and passes through, matching the original
if arg.num and arg.num <= 0:
    print('--num must be a positive integer')
    print('aborting')
    sys.exit()
if arg.ids:
    for bmrbid in arg.ids:  # renamed from 'id' to avoid shadowing builtin
        try:
            int(bmrbid)  # BMRB ids are numeric; value itself is unused
        except ValueError:  # was a bare except: -- catch only the parse error
            print(f'{bmrbid} is not int')
            print('probably not a valid bmrb id')
            print('aborting')
            sys.exit()
    if arg.num:
        print('--num not necessary when using --ids')
        print('re-run without --num')
        print('aborting')
        sys.exit()
def _record(status, msg, errs):
    """Handle one get_data result: messages containing 'aborting' are fatal
    and stop the script; other failures are collected for the final report."""
    if not status:
        if 'aborting' in msg:
            print(msg)
            sys.exit()
        errs.append(msg)

errs = []
if arg.ids:
    # entries listed explicitly on the command line
    for raw_id in arg.ids:
        bmrbid = 'bmr' + raw_id
        if arg.bmrb:
            status, msg = get_data(
                bmrbid=bmrbid, bmrbpath=arg.bmrb, output=arg.output)
            _record(status, msg, errs)
        else:
            # TODO: fetch the entry over the network (wget from bmrb_addr)
            pass
else:
    # pull entries from the artina set metadata, stopping after --num
    with open(arg.artinaset, 'r') as fp:
        entries = json.load(fp)
    count = 0
    for entry in entries:
        bmrb = entry['BMRB code']
        try:
            # take the first id when several are comma-separated
            bmrbid = 'bmr' + bmrb.split(',')[0]
        except (AttributeError, TypeError):  # was bare except: non-string code
            continue
        # NOTE(review): arg.bmrb may be None on this path; get_data would
        # fail inside os.path.abspath. Present in the original too -- confirm
        # whether --bmrb should be required alongside --artinaset.
        status, msg = get_data(
            bmrbid=bmrbid, bmrbpath=arg.bmrb, output=arg.output)
        _record(status, msg, errs)
        count += 1
        if count == arg.num:
            break
if errs:
    print(json.dumps(errs, indent=2))
""" | |
iterate down each subdirectory of timedomain | |
detect if has procpar or pulprog | |
if procpar -- seqfil | |
I can't remember the other way for Bruker; specdb knows | |
os.system() | |
cmd = cp ./* outputdir/id/seqfil | |
make a github | |
git clone | |
git add data/* | |
git commit -m "all data added" | |
git push | |
do only nesg | |
ask for a limit | |
default is 10 | |
summary.json | |
pulse sequence: [paths] | |
readme | |
no processed data, need to do that separately | |
to get more -- the bmrb can be downloaded separately -- can take a full day -- | |
but not no more | |
wget ... ? | |
""" | |