-*- mode: org; -*-

* Meta

#+TITLE: wepy.lysozyme_test
#+AUTHOR: Samuel D. Lotz
#+EMAIL: samuel.lotz@salotz.info
#+STARTUP: overview inlineimages
#+TODO: TODO INPROGRESS WAIT | DONE CANCELLED

* Setup

** Environments

*** common

**** Pip

#+begin_src fundamental :tangle configs/common.requirements.txt
# interaction
ipython
pdbpp

# standard
numpy
pandas
scipy
sqlalchemy
matplotlib
joblib
click
networkx
tabulate
pint
multiprocessing_logging

# this project
-e .

# my libs
git+https://github.com/ADicksonLab/wepy.git
git+https://github.com/ADicksonLab/geomm.git
git+https://github.com/ADicksonLab/CSNAnalysis
git+https://github.com/salotz/slurmify
#+end_src

**** Conda

#+begin_src fundamental :tangle configs/common.env.yaml
name: wepy.lysozyme_test.project
channels:
  - conda-forge
  - defaults
  - omnia
dependencies:
  - openmm
  - openmmtools
#+end_src

*** common_dev

**** Pip

#+begin_src fundamental :tangle configs/common_dev.requirements.txt
# interaction
ipython
pdbpp

# tools
# currently this way doesn't work. Conda does. But I've had more
# success with the debian install
#dvc[all]

# standard
numpy
pandas
scipy
sqlalchemy
matplotlib
joblib
click
networkx
tabulate
pint
multiprocessing_logging

# this project
-e .

# my libs
-e ../../devel/wepy
-e ../../devel/geomm
-e ../../devel/CSNAnalysis
#+end_src

**** Conda

#+begin_src fundamental :tangle configs/common_dev.env.yaml
name: wepy.lysozyme_test.common_dev
channels:
  - conda-forge
  - defaults
  - omnia
dependencies:
  - openmm
  - openmmtools
#+end_src

*** sims

**** Pip

#+begin_src fundamental :tangle configs/sims.requirements.txt
-e .

multiprocessing_logging

# my libs
git+https://github.com/ADicksonLab/wepy.git
git+https://github.com/ADicksonLab/geomm.git
#+end_src

**** Conda

#+begin_src fundamental :tangle configs/sims.env.yaml
name: wepy.lysozyme_test.sims
channels:
  - conda-forge
  - defaults
  - omnia/label/cuda100
  - omnia
dependencies:
  - openmm
  - openmmtools
#+end_src

*** sims_dev

**** Pip

#+begin_src fundamental :tangle configs/sims_dev.requirements.txt
-e .

multiprocessing_logging

# my libs
-e ../../devel/wepy
git+https://github.com/ADicksonLab/geomm.git
#+end_src

**** Conda

#+begin_src fundamental :tangle configs/sims_dev.env.yaml
name: wepy.lysozyme_test.sims_dev
channels:
  - conda-forge
  - defaults
  - omnia/label/cuda100
  - omnia
dependencies:
  - openmm
  - openmmtools
#+end_src
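For reference, one way to stand up the ~sims~ environment from the files above (a minimal sketch; assumes conda is installed and the configs have been tangled to =configs/=):

#+begin_src bash
# create the simulation environment from the tangled conda spec
conda env create -f configs/sims.env.yaml
conda activate wepy.lysozyme_test.sims

# then layer the pip requirements on top
pip install -r configs/sims.requirements.txt
#+end_src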
* Scratch

For scratch work. Ephemeral.

** <2019-12-04 Wed>

#+begin_src python
from wepy_lysozyme_test._tasks import *
from wepy.hdf5 import WepyHDF5

get_job_hdf5_path('49952117')

h5_path = get_job_hdf5_path('49952117')

wepy_h5 = WepyHDF5(h5_path, mode='r')
wepy_h5.open()
#+end_src

** <2019-12-05 Thu>

#+begin_src python
from wepy.boundary_conditions.receptor import UnbindingBC
from wepy_lysozyme_test._tasks import *

contig = get_gexp_span_contig('No', 0)

layout_graph = contig.resampling_tree_layout_graph(
    bc_class=UnbindingBC,
    progress_key='min_distances'
)
#+end_src

** <2019-12-06 Fri>

#+begin_src python
from wepy_lysozyme_test._tasks import *

save_warp_lineages_dcd('No', 0)
#+end_src

** <2019-12-09 Mon>

#+begin_src python
from wepy_lysozyme_test._tasks import *

jobs_df = get_jobs_df()

spans_df = copy(jobs_df).drop(columns=['success', 'status', 'jobid',])

spans_df = spans_df.groupby(['gexp', 'span_id']).count().reset_index().rename(
    columns={'segment_idx' : 'n_segments'})

contig0 = get_gexp_span_contig('No', 0)
contig1 = get_gexp_span_contig('No', 1)
#+end_src

** <2019-12-10 Tue>

Rate plots

#+begin_src python
from wepy_lysozyme_test._tasks import *

gexp_plot_rates_rt('WExplore')
#+end_src

* Configurations

The place for configuration data that parameterizes the other productions and analyses.

* Workflow

A (mostly) ordered outline of how to proceed from beginning to end.

** HPCC Interactive Jobs

#+begin_src bash
srun -N 1 -c 1 --mem=15G --time=48:00:00 -C intel16 --gres=gpu:1 --pty "/bin/bash"
#+end_src

** Standalone Simulations

To validate that our system is okay we do a couple of standalone runs on the system.

*** Script

Here is a script for running a basic kind of simulation:

#+begin_src python :tangle scripts/run.py
import simtk.openmm as omm
import simtk.openmm.app as omma
import simtk.unit as unit

import mdtraj as mdj

from openmmtools import testsystems

# from seh_prep.modules import StateDataFrameOMMReporter

STEP_TIME = 0.002 * unit.picoseconds

LANGEVIN_DEFAULTS = (
    300.0*unit.kelvin,
    1/unit.picosecond,
    STEP_TIME
)

PLATFORM = 'CPU'

FRAME_INTERVAL_TIME = 10 * unit.picosecond
FRAME_INTERVAL = int(round(FRAME_INTERVAL_TIME / STEP_TIME))

def gen_sim(platform):

    integrator = omm.LangevinIntegrator(*LANGEVIN_DEFAULTS)

    test_sys = testsystems.LysozymeImplicit()

    # write a PDB file for reference
    top = mdj.Topology.from_openmm(test_sys.topology)
    traj = mdj.Trajectory(test_sys.positions.value_in_unit(unit.nanometer),
                          topology=top)
    traj.save_pdb('ref.pdb')

    if platform == 'CUDA':
        platform_kwargs = {'Precision' : 'single'}
    elif platform == 'OpenCL':
        platform_kwargs = {"Precision" : 'single'}
    elif platform == 'CPU':
        platform_kwargs = {}
    elif platform == 'Reference':
        platform_kwargs = {}
    else:
        raise ValueError

    platform = omm.Platform.getPlatformByName(platform)

    simulation = omma.Simulation(test_sys.topology, test_sys.system,
                                 integrator, platform=platform,
                                 platformProperties=platform_kwargs)

    simulation.context.setPositions(test_sys.positions)

    # add some reporters
    simulation.reporters.append(omma.DCDReporter('traj.dcd',
                                                 reportInterval=FRAME_INTERVAL,
                                                 enforcePeriodicBox=False))
    # simulation.reporters.append(StateDataFrameOMMReporter(report_interval=10))
    simulation.reporters.append(omma.StateDataReporter('state.csv',
                                                       FRAME_INTERVAL,
                                                       step=True,
                                                       time=True,
                                                       potentialEnergy=True,
                                                       kineticEnergy=True,
                                                       totalEnergy=True,
                                                       temperature=True,
                                                       volume=True,
    ))

    return simulation

if __name__ == "__main__":

    import sys
    import time

    # wall-clock time budget for sampling (seconds)
    sim_time = float(sys.argv[1]) #* unit.second
    platform = sys.argv[2]

    print("Running simulation for {}".format(sim_time))

    tranche_time = 10 * unit.picosecond
    tranche_steps = int(round(tranche_time / STEP_TIME))

    print(f"running tranches of {tranche_steps} steps")

    simulation = gen_sim(platform)

    print("Starting the simulation")
    curr_time = 0.
    step_counter = 0
    start_time = time.time()
    while curr_time < sim_time:

        print("starting from step {}".format(step_counter))

        simulation.step(tranche_steps)

        curr_time = time.time() - start_time
        step_counter += tranche_steps

        print("Ran {}".format(step_counter))
        print("Time currently is: {}".format(curr_time))

    # add up the sampling time
    sampling_time = step_counter * STEP_TIME

    print(f"Ran a total of {step_counter} steps")
    print(f"Ran a total of {sampling_time}")
#+end_src

*** run settings

#+BEGIN_SRC toml :tangle hpcc/standalone/run_settings.toml
# test run value
walltime = "24:00:00"
memory = "20gb"
num_cpus = 2
# test run value
num_gpus = 1
constraint = "[intel18|intel16]"
#+END_SRC

*** context settings

#+BEGIN_SRC toml :tangle hpcc/standalone/context_settings.toml
[context]

# its okay to leave out the exec dir, but you can specify this and the
# run will get executed here and then be moved back
# exec_dir = "$SCRATCH/exec"

# epilog is something that can be run after the run, if it is empty
# here that is okay, this is really only important for moving data
# back after being executed somewhere else
epilog = ""

# this section is for the setup of the run in the script
[context.setup]

# this is the parent dir for the jobs, there should be a directory
# here called 'jobs' and each job will get its own unique directory
goal_dir_path = "$SCRATCH/tree/lab/projects/wepy.lysozyme_test/hpcc/standalone"

# will load these modules from lmod i.e. 'module load '
lmod_modules = ["CUDA/10.0.130"]

# everything in here is translated directly into environment
# variables, e.g. "export ANACONDA_DIR=path/to/anaconda"
[context.setup.env_vars]
ANACONDA_DIR = "/mnt/home/lotzsamu/anaconda3"
#+END_SRC

*** Tasks

**** 1

#+begin_src bash :shebang "#!/bin/bash --login" :tangle hpcc/standalone/tasks/initial.sh
source $HOME/.bashrc

# use the ANACONDA_DIR which is set in the environment variables in
# the setup to set up this shell for anaconda
. ${ANACONDA_DIR}/etc/profile.d/conda.sh

env="wepy.lysozyme_test.sims"

# activate the proper virtualenv
conda activate $env

# this will run for under 24 hours
script="$SCRATCH/tree/lab/projects/wepy.lysozyme_test/scripts/run.py"
${ANACONDA_DIR}/envs/$env/bin/python $script 84000 CUDA
#+end_src

*** Retrieving results

#+begin_src bash
rsync -ravvhhiz \
      lotzsamu@rsync.hpcc.msu.edu:/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test/hpcc/standalone/jobs/ \
      $HOME/tree/lab/projects/wepy.lysozyme_test/hpcc/standalone/jobs
#+end_src

** Test Wepy Simulations

Run very short local simulations for debugging.

*** Run a very short simulation to get some data

#+begin_src bash
./scripts/noresampler_local_test.sh
#+end_src

**** Script

#+begin_src bash :shebang "#!/bin/bash" :tangle scripts/noresampler_local_test.sh
# use the ANACONDA_DIR which is set in the environment variables in
# the setup to set up this shell for anaconda
. ${ANACONDA_DIR}/etc/profile.d/conda.sh

env='wepy.lysozyme_test.sims_dev'

home_root="$HOME"

# activate the proper virtualenv
conda activate $env

# test our openmm installation
${ANACONDA_DIR}/envs/$env/bin/python -m simtk.testInstallation

# get into the right dir and copy the inputs etc.
rm -rf tmp/test_wepy_sim
mkdir -p tmp/test_wepy_sim
cd tmp/test_wepy_sim

# run the simulation
N_CYCLES=10
N_STEPS=10
N_WALKERS=5
N_WORKERS=1
PLATFORM="CPU"
RESAMPLER="REVOResampler"

script="$home_root/tree/lab/projects/wepy.lysozyme_test/scripts/wepy_run_by_cycles.py"
${ANACONDA_DIR}/envs/$env/bin/python $script \
    $N_CYCLES \
    $N_STEPS \
    $N_WALKERS \
    $N_WORKERS \
    $PLATFORM \
    $RESAMPLER
#+end_src

** Wepy Simulations

The production wepy runs for each of the experimental groups (resamplers).

*** By Time

Here is a script for running a basic kind of simulation:

#+begin_src python :tangle scripts/wepy_run_by_time.py
if __name__ == "__main__":

    import sys
    import logging
    from multiprocessing_logging import install_mp_handler

    from wepy_tools.sim_makers.openmm.lysozyme import LysozymeImplicitOpenMMSimMaker

    logging.getLogger().setLevel(logging.DEBUG)
    install_mp_handler()

    if sys.argv[1] == "-h" or sys.argv[1] == "--help":
        print("arguments: walltime, n_steps, n_walkers, n_workers, platform, resampler")
    else:
        walltime = int(sys.argv[1])
        #n_cycles = int(sys.argv[1])
        n_steps = int(sys.argv[2])
        n_walkers = int(sys.argv[3])
        n_workers = int(sys.argv[4])
        platform = sys.argv[5]
        resampler = sys.argv[6]

        print("Number of steps: {}".format(n_steps))
        print("Time to run: {}".format(walltime))
        # print("Number of cycles: {}".format(n_cycles))

        sim_maker = LysozymeImplicitOpenMMSimMaker()

        apparatus = sim_maker.make_apparatus(
            integrator='LangevinIntegrator',
            resampler=resampler,
            bc='UnbindingBC',
            platform=platform,
        )
        config = sim_maker.make_configuration(apparatus,
                                              work_mapper='TaskMapper',
                                              platform=platform)

        sim_manager = sim_maker.make_sim_manager(n_walkers, apparatus, config)

        sim_manager.run_simulation_by_time(walltime, n_steps, num_workers=n_workers)
#+end_src

*** By Num Cycles

Here is a script for running a basic kind of simulation:

#+begin_src python :tangle scripts/wepy_run_by_cycles.py
if __name__ == "__main__":

    import sys
    import logging
    from multiprocessing_logging import install_mp_handler

    from wepy_tools.sim_makers.openmm.lysozyme import LysozymeImplicitOpenMMSimMaker

    logging.getLogger().setLevel(logging.DEBUG)
    install_mp_handler()

    if sys.argv[1] == "-h" or sys.argv[1] == "--help":
        print("arguments: n_cycles, n_steps, n_walkers, n_workers, platform, resampler")
    else:
        n_cycles = int(sys.argv[1])
        n_steps = int(sys.argv[2])
        n_walkers = int(sys.argv[3])
        n_workers = int(sys.argv[4])
        platform = sys.argv[5]
        resampler = sys.argv[6]

        print("Number of steps: {}".format(n_steps))
        print("Number of cycles: {}".format(n_cycles))

        sim_maker = LysozymeImplicitOpenMMSimMaker()

        apparatus = sim_maker.make_apparatus(
            integrator='LangevinIntegrator',
            resampler=resampler,
            bc='UnbindingBC',
            platform=platform,
        )
        config = sim_maker.make_configuration(apparatus,
                                              work_mapper='TaskMapper',
                                              platform=platform)

        sim_manager = sim_maker.make_sim_manager(n_walkers, apparatus, config)

        # NOTE: the original called run_simulation_by_time here, which
        # takes a wall time; for a fixed cycle count the by-cycles
        # entry point is the one intended
        sim_manager.run_simulation(n_cycles, n_steps, num_workers=n_workers)
#+end_src

*** run settings

**** Full GPU Node, Week

#+BEGIN_SRC toml :tangle hpcc/wepy/configs/GPU-8_168-hrs.run_settings.toml
# test run value
walltime = "168:00:00"
memory = "20gb"
num_cpus = 4
num_gpus = 8
constraint = "[intel18|intel16]"
#+END_SRC

**** Single GPU Node, Week

#+BEGIN_SRC toml :tangle hpcc/wepy/configs/GPU-1_168-hrs.run_settings.toml
# test run value
walltime = "168:00:00"
memory = "10gb"
num_cpus = 4
num_gpus = 1
constraint = "[intel18|intel16]"
#+END_SRC

*** context settings

#+BEGIN_SRC toml :tangle hpcc/wepy/configs/context_settings.toml
[context]

# its okay to leave out the exec dir, but you can specify this and the
# run will get executed here and then be moved back
# exec_dir = "$SCRATCH/exec"

# epilog is something that can be run after the run, if it is empty
# here that is okay, this is really only important for moving data
# back after being executed somewhere else
epilog = ""

# this section is for the setup of the run in the script
[context.setup]

# this is the parent dir for the jobs, there should be a directory
# here called 'jobs' and each job will get its own unique directory
goal_dir_path = "$SCRATCH/tree/lab/projects/wepy.lysozyme_test/hpcc/wepy"

# will load these modules from lmod i.e. 'module load '
lmod_modules = ["CUDA/10.0.130"]

# everything in here is translated directly into environment
# variables, e.g. "export ANACONDA_DIR=path/to/anaconda"
[context.setup.env_vars]
ANACONDA_DIR = "/mnt/home/lotzsamu/anaconda3"
#+END_SRC

*** Tasks

**** NoResampler single-walker

#+begin_src bash :shebang "#!/bin/bash" :tangle hpcc/wepy/tasks/noresampler_single.sh
# use the ANACONDA_DIR which is set in the environment variables in
# the setup to set up this shell for anaconda
. ${ANACONDA_DIR}/etc/profile.d/conda.sh

env="wepy.lysozyme_test.sims"

# activate the proper virtualenv
conda activate $env

${ANACONDA_DIR}/envs/$env/bin/python -m simtk.testInstallation

WALLTIME=590400
N_STEPS=10000 # 20 picoseconds
N_WALKERS=1
N_WORKERS=1
PLATFORM="CUDA"
RESAMPLER="NoResampler"

script="$SCRATCH/tree/lab/projects/wepy.lysozyme_test/scripts/wepy_run_by_time.py"
${ANACONDA_DIR}/envs/$env/bin/python $script \
    $WALLTIME \
    $N_STEPS \
    $N_WALKERS \
    $N_WORKERS \
    $PLATFORM \
    $RESAMPLER
#+end_src

**** NoResampler ensemble

#+begin_src bash :shebang "#!/bin/bash" :tangle hpcc/wepy/tasks/noresampler_ensemble.sh
# use the ANACONDA_DIR which is set in the environment variables in
# the setup to set up this shell for anaconda
. ${ANACONDA_DIR}/etc/profile.d/conda.sh

env="wepy.lysozyme_test.sims"

# activate the proper virtualenv
conda activate $env

${ANACONDA_DIR}/envs/$env/bin/python -m simtk.testInstallation

WALLTIME=590400
N_STEPS=10000 # 20 picoseconds
N_WALKERS=48
N_WORKERS=8
PLATFORM="CUDA"
RESAMPLER="NoResampler"

script="$SCRATCH/tree/lab/projects/wepy.lysozyme_test/scripts/wepy_run_by_time.py"
${ANACONDA_DIR}/envs/$env/bin/python $script \
    $WALLTIME \
    $N_STEPS \
    $N_WALKERS \
    $N_WORKERS \
    $PLATFORM \
    $RESAMPLER
#+end_src

**** WExplore

#+begin_src bash :shebang "#!/bin/bash" :tangle hpcc/wepy/tasks/wexplore.sh
# use the ANACONDA_DIR which is set in the environment variables in
# the setup to set up this shell for anaconda
. ${ANACONDA_DIR}/etc/profile.d/conda.sh

env="wepy.lysozyme_test.sims"

# activate the proper virtualenv
conda activate $env

${ANACONDA_DIR}/envs/$env/bin/python -m simtk.testInstallation

WALLTIME=590400
N_STEPS=10000 # 20 picoseconds
N_WALKERS=48
N_WORKERS=8
PLATFORM="CUDA"
RESAMPLER="WExploreResampler"

script="$SCRATCH/tree/lab/projects/wepy.lysozyme_test/scripts/wepy_run_by_time.py"
${ANACONDA_DIR}/envs/$env/bin/python $script \
    $WALLTIME \
    $N_STEPS \
    $N_WALKERS \
    $N_WORKERS \
    $PLATFORM \
    $RESAMPLER
#+end_src

**** REVO

#+begin_src bash :shebang "#!/bin/bash" :tangle hpcc/wepy/tasks/revo.sh
# use the ANACONDA_DIR which is set in the environment variables in
# the setup to set up this shell for anaconda
. ${ANACONDA_DIR}/etc/profile.d/conda.sh

env="wepy.lysozyme_test.sims"

# activate the proper virtualenv
conda activate $env

${ANACONDA_DIR}/envs/$env/bin/python -m simtk.testInstallation

WALLTIME=590400
N_STEPS=10000 # 20 picoseconds
N_WALKERS=48
N_WORKERS=8
PLATFORM="CUDA"
RESAMPLER="REVOResampler"

script="$SCRATCH/tree/lab/projects/wepy.lysozyme_test/scripts/wepy_run_by_time.py"
${ANACONDA_DIR}/envs/$env/bin/python $script \
    $WALLTIME \
    $N_STEPS \
    $N_WALKERS \
    $N_WORKERS \
    $PLATFORM \
    $RESAMPLER
#+end_src

*** Running the simulations

Make sure to push the scripts to HPCC:

#+begin_src bash
inv custom.push-scripts
#+end_src

Generate the scripts on HPCC:

#+begin_src bash
inv custom.hpcc-gen-wepy-submissions
#+end_src

Then submit them into slurm:

#+begin_src bash
(cd hpcc/wepy && sbatch submissions/*)
#+end_src

*** Post-Simulation

Then retrieve the results from HPCC.

#+begin_src bash
rsync -ravvhhiz \
      lotzsamu@rsync.hpcc.msu.edu:/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test/hpcc/wepy/jobs/ \
      $HOME/tree/lab/projects/wepy.lysozyme_test/hpcc/wepy/jobs
#+end_src

Popping the links out to resources should be done on HPCC first so that both are in sync, but this shows it being done locally.

#+begin_src bash
symlink-pop /home/salotz/tree/lab/projects/wepy.lysozyme_test/hpcc/wepy/jobs \
            /home/salotz/tree/lab/resources/project-resources/wepy.lysozyme_test/hpcc/wepy/jobs
#+end_src

Then back them up to volta. Just doing the whole tree since that needs to be done anyway.

#+begin_src bash
inv refugue.rsync --source superior --target volta
#+end_src

Back up to allegheny:

#+begin_src bash
inv refugue.rsync --source superior --target allegheny
#+end_src

** Preprocessing

*** Link Results by Experimental groups

#+begin_src bash
python -m wepy_lysozyme_test.execution.link_results
#+end_src

** Analyze wepy simulations

*** resampling tree

#+begin_src bash
python -m wepy_lysozyme_test.execution.gen_restrees 'WExplore'
#+end_src

#+begin_src bash
python -m wepy_lysozyme_test.execution.gen_restrees 'REVO'
#+end_src

*** terminal lineage trajectories

**** Warping Lineages

To get them all in one trajectory file:

#+begin_src bash
(for gexp in 'No' 'No_single' 'REVO' 'WExplore'; do
     python -m wepy_lysozyme_test.execution.gen_warp_lineages_single $gexp || exit 1
done)
#+end_src

*** Contig Stats: length, warping, rates

#+begin_src bash
python -m wepy_lysozyme_test.execution.gen_contig_stats_table
#+end_src

*** Rate Plots

#+begin_src bash
(for gexp in 'No' 'No_single' 'REVO' 'WExplore'; do
     python -m wepy_lysozyme_test.execution.show_rate_plots $gexp || exit 1
done)
#+end_src

*** TODO compute observables

*** TODO free energy plots

* Analysis

A structured approach to running analytics tasks in python. In the "Tasks" section you write pure functions to do the work. In "Prefect Workflows" you compose those tasks into workflows with Prefect. In "Execution" you write the "scripts" that actually run them.

** Tasks

Define the units of work that you want done here.

*** Initialization

**** Header

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
"""Generated file from the analysis.org file. Do not edit directly."""
#+END_SRC

**** Imports

Imported modules that will be available to all tasks

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
# standard library
import os
import os.path as osp
import pickle
from copy import copy, deepcopy
import itertools as it
from collections import defaultdict
from pathlib import Path
from operator import attrgetter

# de facto standard library
import numpy as np
import pandas as pd
import sqlalchemy as sqla
import matplotlib.pyplot as plt
import networkx as nx
from tabulate import tabulate

# extra non-domain specific
import joblib
import pint
import simtk.unit as tkunit
#+END_SRC

**** Configuration

#+begin_src python :tangle src/wepy_lysozyme_test/_tasks.py
PROJECT_PATH = "$HOME/tree/lab/projects/wepy.lysozyme_test"
#+end_src

**** Paths

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
## Paths

# for localizing paths to very commonly used resources and resources
# which may change schema. The directory structure for the rest is the
# schema, so just use osp.join(project_path(), 'subpath/to/resource')
# for the rest so a lot of work is reduced in specifying all of them

def project_path():
    return Path(osp.expanduser(osp.expandvars(PROJECT_PATH)))

def data_path():
    return project_path() / 'data'

def db_path():
    return project_path() / 'db'

def media_path():
    return project_path() / 'media'

def scratch_path():
    return project_path() / 'scratch'

def scripts_path():
    return project_path() / 'scripts'

def src_path():
    return project_path() / 'src'

def tmp_path():
    return project_path() / 'tmp'

def troubleshoot_path():
    return project_path() / 'troubleshoot'

# specific things

def sqlite_path():
    return project_path() / 'db/db.sqlite'

def joblib_cache_path():
    return project_path() / 'cache/joblib'
#+END_SRC

**** Setup

***** Joblib Caching

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
## Setup

# set up the joblib cache
jlmem = joblib.Memory(joblib_cache_path())
#+END_SRC

***** Recursion limit

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
RECURSION_LIMIT = 1000000

# set this when you want to do some recursion stuff with contigtrees
def _set_recursion_limit(recursion_limit=RECURSION_LIMIT):
    import sys
    sys.setrecursionlimit(recursion_limit)
    print("Setting recursion limit to {}".format(recursion_limit))

# set the recursion depth since it is always needing to be increased
_set_recursion_limit()
#+END_SRC

***** Units: pint

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
UNITS = pint.UnitRegistry()
#+END_SRC
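A tiny sketch of how the registry can be used (nothing else in the pipeline uses it yet), e.g. converting a number of cycle steps into simulation time:

#+begin_src python
from wepy_lysozyme_test._tasks import UNITS

step_time = 2 * UNITS.femtosecond
# 10000 steps per cycle -> 20.0 picosecond
print((10000 * step_time).to(UNITS.picosecond))
#+end_src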
***** COMMENT SQlite

**** Data: Read & Write

These functions infer the type of file you want to write based on the file extension.

#+begin_src python :tangle src/wepy_lysozyme_test/_tasks.py
def init_dir(dir_path):
    dir_path.mkdir(exist_ok=True, parents=True)

def load_obj(filepath):

    fname = osp.basename(filepath)

    # use the file extension for how to load it
    if fname.endswith('jl.pkl'):
        # it is a joblib object so use joblib to load it
        with open(filepath, 'rb') as rf:
            obj = joblib.load(rf)
    elif fname.endswith('pkl'):
        # it is a pickle object so use pickle to load it
        with open(filepath, 'rb') as rf:
            obj = pickle.load(rf)
    else:
        raise ValueError("Must choose an extension for format selection")

    return obj

def save_obj(obj_path, obj, overwrite=False, ext='jl.pkl'):

    import os
    import os.path as osp
    import pickle

    import joblib

    if ext == 'jl.pkl':
        pickler_dump = joblib.dump
    elif ext == 'pkl':
        pickler_dump = pickle.dump
    else:
        raise ValueError("Must choose an extension for format selection")

    # if we are not overwriting check if it exists
    if not overwrite:
        if osp.exists(obj_path):
            raise OSError("File exists ({}), not overwriting".format(obj_path))

    # otherwise make sure the directory exists
    os.makedirs(osp.dirname(obj_path), exist_ok=True)

    # dump with the chosen pickler
    with open(obj_path, 'wb') as wf:
        pickler_dump(obj, wf)

def load_table(filepath):

    import os.path as osp
    import pandas as pd

    fname = osp.basename(filepath)

    # use the file extension for how to load it
    if fname.endswith('csv'):
        df = pd.read_csv(filepath, index_col=0)
    elif fname.endswith('pkl'):
        df = pd.read_pickle(filepath)
    else:
        raise ValueError("extension not supported")

    return df

def save_table(table_path, df, overwrite=False, ext='csv'):

    import os
    import os.path as osp
    import pickle

    import pandas as pd

    # if we are not overwriting check if it exists
    if not overwrite:
        if osp.exists(table_path):
            raise OSError("File exists ({}), not overwriting".format(table_path))

    # otherwise make sure the directory exists for this observable
    os.makedirs(osp.dirname(table_path), exist_ok=True)

    if ext == 'csv':
        df.to_csv(table_path)
    elif ext == 'pkl':
        df.to_pickle(table_path)
    else:
        raise ValueError("extension not supported")
#+end_src
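A hypothetical round trip with these helpers (the path is just an illustrative spot under the project tree):

#+begin_src python
from wepy_lysozyme_test._tasks import scratch_path, save_obj, load_obj

obj = {'gexp': 'No', 'span_id': 0}
path = scratch_path() / 'example.jl.pkl'

# format is chosen by the extension
save_obj(path, obj, overwrite=True, ext='jl.pkl')
assert load_obj(path) == obj
#+end_src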
*** Parameters

The parameters and constants for the pipeline.

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
## Parameters

# number of individual steps per cycle in simulations
NUM_CYCLE_STEPS = 10000
STEP_TIME = 2 * tkunit.femtosecond
CYCLE_TIME = NUM_CYCLE_STEPS * STEP_TIME

# we could go in and get these values from the sim maker itself, but
# we'll keep it simple since this is the answer
WE_PMIN = 1e-12

## Visualization

# Resampling trees
PROGRESS_KEY = 'min_distances'
MAX_PROGRESS_NORM = 1.0
DISCONTINUOUS_NODE_SHAPE = 'square'
DEFAULT_NODE_SHAPE = 'disc'
COLORMAP_NAME = 'plasma'
#+END_SRC

**** Experimental Constants

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
EXPERIMENTAL_MFPT_UNIT = tkunit.minute

# TODO: in the paper the unit is written "x 10^{-4} s^{-1}"
EXPERIMENTAL_RATE_UNIT = None

EXPERIMENTAL_RATE = 950 * (1/tkunit.second)
EXPERIMENTAL_MFPT = 1 / EXPERIMENTAL_RATE
EXPERIMENTAL_RT = EXPERIMENTAL_MFPT
#+END_SRC

*** Data

**** Reference System

#+begin_src python :tangle src/wepy_lysozyme_test/_tasks.py
def get_sim_maker():
    from wepy_tools.sim_makers.openmm.lysozyme import LysozymeImplicitOpenMMSimMaker

    return LysozymeImplicitOpenMMSimMaker()

def get_ref_state():
    return get_sim_maker().init_state

def get_distance_func():
    return get_sim_maker().distance

def get_ref_image():
    maker = get_sim_maker()

    # apply the distance metric image function to the reference state
    return maker.distance.image(maker.init_state)

def get_topology():
    return get_sim_maker().json_top()
#+end_src

**** Selection Idxs and Tops

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
# just for reference all the keys we want to fill in
SELECTION_KEYS = ('all_atoms/ligand',
                  'all_atoms/protein',
                  'all_atoms/binding_site',
                  'all_atoms/image',
                  'image/ligand',
                  'image/binding_site',
                  'image/protein',
)

TOP_KEYS = ('all_atoms', 'image', 'ligand')

@jlmem.cache
def get_selection_idxs_tops():
    """compute both at once for convenience"""

    import numpy as np
    import mdtraj as mdj

    from wepy.util.json_top import json_top_subset
    from wepy.util.mdtraj import (
        mdtraj_to_json_topology,
        json_to_mdtraj_topology,
        traj_fields_to_mdtraj,
    )

    # fill these in
    sel_tops = {}
    sel_idxs = {}

    ### Initialize

    sim_maker = get_sim_maker()

    BC = 'UnbindingBC'
    BC_PARAMS = sim_maker.DEFAULT_BC_PARAMS[BC]

    ligand_idxs = list(sim_maker.ligand_idxs())
    protein_idxs = list(sim_maker.receptor_idxs())
    binding_site_idxs = list(sim_maker.binding_site_idxs(BC_PARAMS['cutoff_distance']))
    image_idxs = list(sim_maker.distance._image_idxs)

    # the initial state
    init_state = get_ref_state()
    init_box_vectors = init_state['box_vectors'] # * init_state.box_vectors_unit

    # the full topology
    all_atoms_top = get_topology()

    ### All atoms

    sel_tops['all_atoms'] = all_atoms_top

    # the all atoms subselections
    sel_idxs['all_atoms/ligand'] = ligand_idxs
    sel_idxs['all_atoms/protein'] = protein_idxs
    sel_idxs['all_atoms/binding_site'] = binding_site_idxs

    ### Image

    # get the topology for this
    image_top = json_top_subset(all_atoms_top, image_idxs)
    sel_tops['image'] = image_top

    sel_idxs['all_atoms/image'] = image_idxs

    sel_idxs['image/ligand'] = [image_idxs.index(i) for i in ligand_idxs
                                if i in image_idxs]
    sel_idxs['image/protein'] = [image_idxs.index(i) for i in protein_idxs
                                 if i in image_idxs]

    # the only protein atoms present in the image are the binding site
    # ones, so these two selections coincide
    sel_idxs['image/binding_site'] = sel_idxs['image/protein']

    ### Ligand

    ligand_top = json_top_subset(all_atoms_top, sel_idxs['all_atoms/ligand'])
    sel_tops['ligand'] = ligand_top

    return sel_idxs, sel_tops

def get_selection_idxs():
    sels, tops = get_selection_idxs_tops()
    return sels

def get_selection_tops():
    sels, tops = get_selection_idxs_tops()
    return tops

# make a dataframe summarizing the counts of the different selections
def selection_df():

    from collections import defaultdict

    selection_df = defaultdict(list)

    selections = get_selection_idxs()
    for selection_key, idxs in selections.items():
        selection_df[selection_key].append(len(idxs))

    selection_df = pd.DataFrame(selection_df)

    return selection_df
#+END_SRC

**** Atom Selection Trajs

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
def get_ref_state_traj_fields(rep_key='main_rep'):

    import numpy as np

    selection_idxs = get_selection_idxs()
    tops = get_selection_tops()

    ref_state = get_ref_state()

    ref_state_traj_fields = {}

    if rep_key is None or rep_key == 'all_atoms':
        positions = ref_state['positions']
    else:
        positions = ref_state['positions'][
            selection_idxs['all_atoms/{}'.format(rep_key)]]

    ref_state_traj_fields['positions'] = np.array([positions])
    ref_state_traj_fields['box_vectors'] = np.array([ref_state['box_vectors']])

    return ref_state_traj_fields

@jlmem.cache
def ref_selection_trajs():

    import numpy as np

    from wepy.util.mdtraj import traj_fields_to_mdtraj

    selection_idxs = get_selection_idxs()
    tops = get_selection_tops()

    ref_state = get_ref_state()

    ref_state_fields = {key : np.array([value])
                        for key, value in ref_state.dict().items()}

    # make fields for the different representations (wrapped with a
    # leading frame axis like the fields above; the originals were
    # missing this)
    ref_state_fields['image'] = np.array(
        [ref_state['positions'][selection_idxs['all_atoms/image']]])
    ref_state_fields['ligand'] = np.array(
        [ref_state['positions'][selection_idxs['all_atoms/ligand']]])

    # all atoms
    all_atoms_traj = traj_fields_to_mdtraj(ref_state_fields,
                                           tops['all_atoms'],
                                           rep_key='positions')

    # image
    image_traj = traj_fields_to_mdtraj(ref_state_fields,
                                       tops['image'],
                                       rep_key='image')

    # ligand
    ligand_traj = traj_fields_to_mdtraj(ref_state_fields,
                                        tops['ligand'],
                                        rep_key='ligand')

    ref_selections = {}
    ref_selections['all_atoms'] = all_atoms_traj
    ref_selections['image'] = image_traj
    ref_selections['ligand'] = ligand_traj

    return ref_selections
#+END_SRC
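For a quick look at the selection sizes (assumes the simulation inputs are available locally so the cached selection function can run):

#+begin_src python
from tabulate import tabulate

from wepy_lysozyme_test._tasks import selection_df

print(tabulate(selection_df(), headers='keys', tablefmt='orgtbl'))
#+end_src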
**** Experimental Groups and Jobs

- gexp :: short for "experimental group", and perhaps a pun on "experimental-group expression" since we organize each experimental group by a singular structured name

The experimental conditions for each simulation are specified here with reference to the job id for each of them.

#+begin_src python :tangle src/wepy_lysozyme_test/_tasks.py
EXPERIMENTAL_GROUPS = ('No', 'No_single', 'REVO', 'WExplore',)

# the parameters used for each test group
from wepy.resampling.resamplers.resampler import NoResampler
from wepy.resampling.resamplers.revo import REVOResampler
from wepy.resampling.resamplers.wexplore import WExploreResampler

from wepy.resampling.decisions.decision import NoDecision
from wepy.resampling.decisions.clone_merge import MultiCloneMergeDecision

GEXP_METADATA = {
    'No' : {
        'resampler' : NoResampler,
        'decision' : NoDecision,
        'num_walkers' : 48,
    },
    'No_single' : {
        'resampler' : NoResampler,
        'decision' : NoDecision,
        'num_walkers' : 1,
    },
    'REVO' : {
        'resampler' : REVOResampler,
        'decision' : MultiCloneMergeDecision,
        'num_walkers' : 48,
    },
    'WExplore' : {
        'resampler' : WExploreResampler,
        'decision' : MultiCloneMergeDecision,
        'num_walkers' : 48,
    },
}

def get_jobs_df():

    jobs_path = data_path() / 'sim_management/jobs.csv'

    jobs_df = pd.read_csv(jobs_path)

    return jobs_df

def get_gexp_jobs():

    jobs_df = get_jobs_df()

    gexp_jobs = {}
    for gexp, grp_df in jobs_df.groupby('gexp'):
        gexp_jobs[gexp] = list(grp_df['jobid'].values)

    return gexp_jobs

def get_gexp_paths():
    """Get a dictionary of all of the paths for the jobs dirs for each
    experimental group."""

    gexp_jobs = get_gexp_jobs()

    sims_dir = data_path() / 'sims/wepy/jobs'

    gexp_paths = defaultdict(list)
    for grp_name, jobids in gexp_jobs.items():
        for job_id in jobids:
            path = sims_dir / str(job_id)
            gexp_paths[grp_name].append(path)

    return gexp_paths
#+end_src

**** WepyHDF5s

Raw access to WepyHDF5s for each experimental group and the paths to the sample files.

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
def get_job_hdf5_path(jobid):
    return data_path() / f'sims/wepy/jobs/{jobid}/root.wepy.h5'

def get_linker_hdf5_path(gexp):
    return data_path() / f'results/{gexp}.wepy.h5'

def link_gexp_hdf5s(gexp):

    from wepy.hdf5 import WepyHDF5

    jobids = get_gexp_jobs()[gexp]

    h5_paths = [get_job_hdf5_path(jobid) for jobid in jobids]

    # get a template wepy_hdf5 file
    template_h5 = WepyHDF5(h5_paths[0], mode='r')

    # make the linker file
    linker_h5_path = get_linker_hdf5_path(gexp)
    with template_h5:
        linker_h5 = template_h5.clone(linker_h5_path, mode='w')

    # then link all the job files to it
    with linker_h5:

        job_run_idxs = {}
        for jobid, h5_path in zip(jobids, h5_paths):

            # link the run from the file
            new_run_idx = linker_h5.link_run(h5_path, 0)
            job_run_idxs[jobid] = new_run_idx

            # set the jobid as a run attr
            linker_h5.run(new_run_idx).attrs['jobid'] = jobid

    return linker_h5_path

def get_gexp_wepy_h5(gexp):

    from wepy.hdf5 import WepyHDF5

    path = get_linker_hdf5_path(gexp)

    return WepyHDF5(path, mode='r')

def get_all_wepy_h5s():

    d = {}
    for gexp in EXPERIMENTAL_GROUPS:
        d[gexp] = get_gexp_wepy_h5(gexp)

    return d

def get_gexp_jobs_to_runs_map(gexp):
    """Get the runs in the HDF5 for the gexp that are mapped to by the
    jobids."""

    wepy_h5 = get_gexp_wepy_h5(gexp)

    jobs_runs_d = {}
    with wepy_h5:
        for run_idx in wepy_h5.run_idxs:
            run_grp = wepy_h5.run(run_idx)

            jobid = run_grp.attrs['jobid']
            run_idx = run_grp.attrs['run_idx']

            jobs_runs_d[jobid] = run_idx

    return jobs_runs_d

def get_gexp_span_ids(gexp):

    jobs_df = get_jobs_df()
    gexp_jobs_df = jobs_df[jobs_df['gexp'] == gexp]

    span_ids = sorted(list(set(gexp_jobs_df['span_id'].values)))

    return span_ids
def get_gexp_span_ids_run_idxs(gexp):
    """Get a mapping of the span_ids (names given to them) to the run
    idxs (in order of the segments in the span via 'segment_idxs').

    The span_ids and segment_idxs come from the jobs_df table.

    """

    jobs_df = get_jobs_df()
    gexp_jobs_df = jobs_df[jobs_df['gexp'] == gexp]

    # get the mapping of jobids -> h5 run idxs
    jobs_runs_d = get_gexp_jobs_to_runs_map(gexp)

    # then go through each span and get the run idxs for it
    spans_runs = {}
    for span_id, span_df in gexp_jobs_df.groupby('span_id'):

        # make records for each segment so that we can sort them
        span_segs = []
        for idx, row in span_df.iterrows():
            seg_rec = (row['segment_idx'], row['jobid'])
            span_segs.append(seg_rec)

        # sort them, dereference the run_idx
        span_run_idxs = [jobs_runs_d[jobid]
                         for seg_idx, jobid in sorted(span_segs)]

        spans_runs[span_id] = span_run_idxs

    return spans_runs
#+END_SRC

**** Contig Trees

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
# the base contigtree is the one that can be cached, not the one with
# the HDF5 file in it, so we cache this. You can use the other
# function to get the contigtree with the HDF5, but the opening of
# file handles is annoying and this might be the best way to do this
# anyhow
@jlmem.cache
def get_base_contigtree(gexp):

    from wepy.analysis.contig_tree import BaseContigTree
    from wepy.boundary_conditions.receptor import UnbindingBC

    wepy_h5 = get_gexp_wepy_h5(gexp)

    # get the decision class for this gexp
    decision = GEXP_METADATA[gexp]['decision']

    _set_recursion_limit()

    base_contigtree = BaseContigTree(wepy_h5,
                                     decision_class=decision,
                                     boundary_condition_class=UnbindingBC)

    return base_contigtree

def get_contigtree(gexp):

    from wepy.analysis.contig_tree import ContigTree

    wepy_h5 = get_gexp_wepy_h5(gexp)

    base_contigtree = get_base_contigtree(gexp)

    return ContigTree(wepy_h5,
                      base_contigtree=base_contigtree)

def all_contigtrees():

    d = {}
    for gexp in EXPERIMENTAL_GROUPS:
        d[gexp] = get_contigtree(gexp)

    return d

@jlmem.cache
def get_gexp_span_ids_span_idxs_map(gexp):
    """Get a mapping of the span_ids to the span_idxs in the contigtree."""

    contigtree = get_base_contigtree(gexp)

    # get the first run idx -> span_id map
    run_idxs_to_span_ids = {run_idxs[0] : span_id
                            for span_id, run_idxs
                            in get_gexp_span_ids_run_idxs(gexp).items()}

    span_ids_to_span_idxs_d = {}
    for span_idx, trace in contigtree.span_traces.items():

        run_idx = trace[0][0]

        span_id = run_idxs_to_span_ids[run_idx]

        span_ids_to_span_idxs_d[span_id] = span_idx

    return span_ids_to_span_idxs_d

## Contigs

def get_gexp_span_contig(gexp, span_id):

    # get the span idx in the contigtree from the span_id in the jobs
    # table
    span_ids_to_span_idxs_d = get_gexp_span_ids_span_idxs_map(gexp)
    span_idx = span_ids_to_span_idxs_d[span_id]

    # get the contig for this span
    contigtree = get_contigtree(gexp)
    contig = contigtree.span_contig(span_idx)

    return contig
#+END_SRC

**** Warping

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
@jlmem.cache
def get_gexp_span_warps(gexp, span_id):

    contig = get_gexp_span_contig(gexp, span_id)

    warp_df = contig.warping_records_dataframe()

    return warp_df
#+END_SRC

**** Parent Table and Forest

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
# ParentForest is used below but was not imported in the original file
from wepy.analysis.parents import ParentForest

@jlmem.cache
def get_gexp_span_parent_forest(gexp, span_id):

    contig = get_gexp_span_contig(gexp, span_id)

    parent_forest = ParentForest(contig=contig)

    return parent_forest

@jlmem.cache
def get_gexp_span_parent_table(gexp, span_id, discontinuities=False):

    contig = get_gexp_span_contig(gexp, span_id)

    parent_table = contig.parent_table(discontinuities=discontinuities)

    return parent_table
#+END_SRC
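A quick sanity-check sketch over the parent table (a hypothetical helper, not part of the pipeline; assumes the table is a per-cycle sequence of parent walker indices, with negative values marking discontinuities):

#+begin_src python
from collections import Counter

from wepy_lysozyme_test._tasks import get_gexp_span_parent_table

def count_clones_per_cycle(parent_table):
    clone_counts = []
    for cycle_parents in parent_table:
        # ignore discontinuity markers
        counts = Counter(p for p in cycle_parents if p >= 0)
        # each extra child of the same parent beyond the first is a clone
        clone_counts.append(sum(n - 1 for n in counts.values()))
    return clone_counts

# e.g.:
# print(count_clones_per_cycle(get_gexp_span_parent_table('WExplore', 0)))
#+end_src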
**** TODO Macro-State Networks

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
@jlmem.cache
def get_state_network(gexp, observable, lag_time):
    # clear cache

    # NOTE: get_clustering_model and cluster_center_run_trace_idxs
    # used below are not defined in this file yet; this section is
    # still TODO

    from wepy.analysis.network import MacroStateNetwork

    contigtree = get_contigtree(gexp)

    cluster_key = "observables/{}".format(observable)

    net = MacroStateNetwork(contigtree,
                            transition_lag_time=lag_time,
                            assg_field_key=cluster_key)

    # set the macrostate weights
    net.set_macrostate_weights()

    # set the index of the cluster centers for each macrostate, both
    # the feature index (from the raw vector of features) and the
    # trace-like index (run, traj, frame)
    clf = get_clustering_model(observable, gexp)

    # the feature vector index (the concatenated features)
    net.set_nodes_attribute('center_feature_idx',
                            {node_idx : feature_idx
                             for node_idx, feature_idx
                             in enumerate(clf.cluster_ids_)})

    # get the run trace index of each cluster center
    net.set_nodes_attribute('center_idx',
                            {node_idx : trace_idx
                             for node_idx, trace_idx
                             in enumerate(cluster_center_run_trace_idxs(observable, gexp))})

    return net.base_network
#+END_SRC

*** Generating Files

**** Write Reference Topology PDBs

Write out topology PDBs for all of the selection representations. These should be from the initial walkers so we can also use them as reference states.

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
def write_ref_selection_pdbs():

    import os
    import os.path as osp

    ref_selections = ref_selection_trajs()

    top_dir = data_path() / 'top'

    # make sure the directory exists
    os.makedirs(top_dir, exist_ok=True)

    for sel_name, sel_ref_traj in ref_selections.items():

        pdb_path = top_dir / "{}_ref.pdb".format(sel_name)

        sel_ref_traj.save_pdb(pdb_path)
#+END_SRC

**** Write restree

***** GEXF

#+begin_src python :tangle src/wepy_lysozyme_test/_tasks.py
def save_restree_gexf(gexp, span_id):

    restree_path = data_path() / f'restree/gexp-{gexp}/span-{span_id}.restree.gexf'
    restree_path.parent.mkdir(exist_ok=True, parents=True)

    layout_graph = get_restree_graph(gexp, span_id)

    nx.write_gexf(layout_graph.viz_graph, restree_path)
#+end_src

**** Warping lineages

#+begin_src python :tangle src/wepy_lysozyme_test/_tasks.py
def save_warp_lineages_dcds(gexp, span_id):

    lineages_dir_path = data_path() / f'warp-lineages/gexp-{gexp}/span-{span_id}_warp-lineages'
    lineages_dir_path.mkdir(exist_ok=True, parents=True)

    lineage_traces = get_warp_lineage_traces(gexp, span_id)

    contig = get_gexp_span_contig(gexp, span_id)

    total_frames = 0
    with contig:
        for warp_idx, lineage_trace in enumerate(lineage_traces):

            total_frames += len(lineage_trace)
            print(f"warp: {warp_idx} has {len(lineage_trace)} frames")

            path = lineages_dir_path / f'warp-{warp_idx}.dcd'

            print("loading")
            traj = contig.wepy_h5.traj_fields_to_mdtraj(
                contig.wepy_h5.get_trace_fields(lineage_trace,
                                                ['positions', 'box_vectors']
                ))

            print("saving")
            traj.save_dcd(str(path))

    print(f"total frames: {total_frames}")

def save_warp_lineages_dcd(gexp, span_id):

    lineages_path = data_path() / f'warp-lineages/gexp-{gexp}/span-{span_id}_warp-lineages.dcd'
    lineages_path.parent.mkdir(exist_ok=True, parents=True)

    # flatten the lineages into a single trace (as a list so it can be
    # counted and reused; the original left this as an iterator and
    # referenced an undefined name in the print below)
    lineages_trace = list(it.chain(*get_warp_lineage_traces(gexp, span_id)))

    print(f"total frames: {len(lineages_trace)}")

    contig = get_gexp_span_contig(gexp, span_id)

    print("Loading")
    with contig:
        traj = contig.wepy_h5.traj_fields_to_mdtraj(
            contig.wepy_h5.get_trace_fields(lineages_trace,
                                            ['positions', 'box_vectors']
            ))

    print("saving DCD")
    traj.save_dcd(str(lineages_path))
#+end_src
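A minimal sketch for eyeballing one of the saved DCDs (assumes the all-atom reference PDB from ~write_ref_selection_pdbs~ and the ~No~ gexp, span 0 outputs exist):

#+begin_src python
import mdtraj as mdj

from wepy_lysozyme_test._tasks import data_path

traj = mdj.load_dcd(
    str(data_path() / 'warp-lineages/gexp-No/span-0_warp-lineages.dcd'),
    top=str(data_path() / 'top/all_atoms_ref.pdb'),
)
print(traj)
#+end_src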
**** Span Stats

#+begin_src python :tangle src/wepy_lysozyme_test/_tasks.py
def save_span_stats_table(overwrite=True):

    table_path = data_path() / f'span-stats/span-stats.csv'

    spans_df = get_span_stats_df()

    save_table(table_path, spans_df, overwrite=overwrite)
#+end_src

*** Utilities

**** TODO Stack Topologies

Make a topology which has multiple identical chains in it.

**** GEXF

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
def clean_gephi_gexf(gexf):

    from xmltodict import parse, unparse

    # parse the bad gexf file
    bad_gexf = parse(gexf)

    # the good tag we take things from
    # NOTE: the reference tag text was lost here; it should be a known
    # good <gexf ...> root tag to copy attributes from
    good_gexf_tag = """ """

    # parse the good tag and get the values
    tag_attrs = {}
    for key, value in parse(good_gexf_tag)['gexf'].items():
        if not key.startswith('@'):
            continue
        else:
            tag_attrs[key] = value

    # read the bad xml and parse it
    # then replace the tag attributes for this with the good ones
    for key, value in tag_attrs.items():
        bad_gexf['gexf'][key] = value

    # then get the good xml string
    return unparse(bad_gexf)
#+END_SRC

*** Exploring Simulations

**** Resampling Trees

#+begin_src python :tangle src/wepy_lysozyme_test/_tasks.py
@jlmem.cache
def get_restree_graph(gexp, span_id, progress_key=PROGRESS_KEY):
    # clear cache

    from wepy.boundary_conditions.receptor import UnbindingBC

    contig = get_gexp_span_contig(gexp, span_id)

    # (the original ignored the progress_key argument and used the
    # module constant)
    layout_graph = contig.resampling_tree_layout_graph(
        bc_class=UnbindingBC,
        progress_key=progress_key,
    )

    return layout_graph
#+end_src

**** Exit Point trajectories

#+begin_src python :tangle src/wepy_lysozyme_test/_tasks.py
#@jlmem.cache
def get_warp_lineage_traces(gexp, span_id):

    contig = get_gexp_span_contig(gexp, span_id)

    with contig:
        warp_trace = contig.warp_contig_trace()
        lineage_traces = contig.lineages(warp_trace)

    return lineage_traces

def get_warp_lineage_trajs(gexp, span_id):

    lineage_traces = get_warp_lineage_traces(gexp, span_id)

    # the contig was missing here in the original
    contig = get_gexp_span_contig(gexp, span_id)

    with contig:
        trajs = []
        for warp_idx, lineage_trace in enumerate(lineage_traces):
            traj = contig.wepy_h5.traj_fields_to_mdtraj(
                contig.wepy_h5.get_trace_fields(lineage_trace,
                                                ['positions', 'box_vectors']
                ))
            trajs.append(traj)

    return trajs
#+end_src

**** Span Stats

#+begin_src python :tangle src/wepy_lysozyme_test/_tasks.py
SPANS_TABLE_COLS = (
    'gexp',
    'span_id',
    'n_runs',
    'n_warps',
    'n_cycles',
    'total_sampling_time_ps', # picoseconds
    'warps_per_ps', # picoseconds
    'rate_ps', # picoseconds
    'total_warped_weight',
    'warp_weight_mean',
    'warp_weight_median',
    'warp_weight_min',
    'warp_weight_max',
    'warp_wait-time_mean_ps',
    'warp_wait-time_median_ps',
    'warp_wait-time_min_ps',
    'warp_wait-time_max_ps',
)

@jlmem.cache
def get_span_stats_df():

    from wepy.analysis.rates import contig_warp_rates

    jobs_df = get_jobs_df()
    spans_df = copy(jobs_df).drop(columns=['success', 'status', 'jobid',])

    # count the number of segments in the jobs
    spans_df = spans_df.groupby(['gexp', 'span_id']).count().reset_index().rename(
        columns={'segment_idx' : 'n_runs'})

    # get the number of cycles and warps in the span by reading the
    # contig
    new_cols = {
        'n_warps' : [],
        'n_cycles' : [],
        'total_sampling_time_ps' : [],
        'warps_per_ps' : [],
        'rate_ps' : [],
        'total_warped_weight' : [],
        'warp_weight_mean' : [],
        'warp_weight_median' : [],
        'warp_weight_min' : [],
        'warp_weight_max' : [],
        'warp_wait-time_mean_ps' : [],
        'warp_wait-time_median_ps' : [],
        'warp_wait-time_min_ps' : [],
        'warp_wait-time_max_ps' : [],
    }

    for row_idx, row in spans_df.iterrows():

        print(row['gexp'], row['span_id'])

        contig = get_gexp_span_contig(row['gexp'], row['span_id'])

        with contig:

            n_cycles = contig.num_cycles

            # calculate rates and fluxes
            total_weight, rate, total_sampling_time = contig_warp_rates(
                contig,
                CYCLE_TIME
            )[0][0]

            warps_df = contig.warping_records_dataframe()

            n_warps = len(warps_df)
            # the unweighted rate of warp events
            warps_per_sampling = n_warps / total_sampling_time

            total_warped_weight = warps_df['weight'].sum()

            # descriptive statistics on the warp weights
            mean_warp_weight = warps_df['weight'].mean()
            median_warp_weight = warps_df['weight'].median()
            min_warp_weight = warps_df['weight'].min()
            max_warp_weight = warps_df['weight'].max()

            wait_times_ps = warps_df['cycle_idx'] * CYCLE_TIME.in_units_of(tkunit.picosecond)

            # descriptive statistics on the warp waiting times
            mean_warp_wait_time = wait_times_ps.mean()
            median_warp_wait_time = wait_times_ps.median()
            min_warp_wait_time = wait_times_ps.min()
            max_warp_wait_time = wait_times_ps.max()

        new_cols['n_cycles'].append(n_cycles)
        new_cols['rate_ps'].append(rate.value_in_unit((1/tkunit.picosecond).unit))
        new_cols['total_sampling_time_ps'].append(
            total_sampling_time.value_in_unit(tkunit.picosecond))
        new_cols['n_warps'].append(n_warps)
        new_cols['total_warped_weight'].append(total_warped_weight)
        new_cols['warps_per_ps'].append(
            warps_per_sampling.value_in_unit((1/tkunit.picosecond).unit))

        # warp weight statistics
        new_cols['warp_weight_mean'].append(mean_warp_weight)
        new_cols['warp_weight_median'].append(median_warp_weight)
        new_cols['warp_weight_min'].append(min_warp_weight)
        new_cols['warp_weight_max'].append(max_warp_weight)

        # warp wait time statistics
        new_cols['warp_wait-time_mean_ps'].append(mean_warp_wait_time)
        new_cols['warp_wait-time_median_ps'].append(median_warp_wait_time)
        new_cols['warp_wait-time_min_ps'].append(min_warp_wait_time)
        new_cols['warp_wait-time_max_ps'].append(max_warp_wait_time)

    # add the columns
    for colname, col in new_cols.items():
        spans_df[colname] = col

    return spans_df

def span_stats_table_str():

    spans_df = get_span_stats_df()

    table_str = tabulate(spans_df,
                         headers=spans_df.columns,
                         tablefmt='orgtbl'
    )

    return table_str
#+end_src

*** Rate Plots

The calculated overall rates for the simulations are in the [[*Span Stats][Span Stats]]. These are the plots of the rates and weights over time.

**** Plotting Functions

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
def plot_rates(run_target_weights_rates,
               cycle_time,
               experimental_rt,
               target_idx=0,
               min_probability=1e-12,
               mfpt_unit=tkunit.second,
               time_unit=tkunit.picosecond,
               logscale=False):

    import numpy as np
    import scipy.stats
    import scipy.stats.mstats

    RUN_LABEL_TEMPLATE = "Run: {}"

    # collate data
    runs_weights = {}
    runs_rates = {}
    runs_rts = {}
    runs_times = {}
    for run_idx, target_weights_rates in run_target_weights_rates.items():

        run_cum_weights = []
        run_rates = []
        run_times = []
        for cycle_idx, timepoint in enumerate(target_weights_rates):
            # if there were no records (and thus no weights and rates,
            # i.e. an empty dictionary) we set zero-valued placeholders
            # here so we can easily discard them later but still keep
            # track of time
            if not timepoint:
                #weight, rate, time = np.nan, np.nan, cycle_time * cycle_idx
                weight, rate, time = 0.0, 0.0 * 1/time_unit, cycle_time * cycle_idx
            else:
                weight, rate, time = timepoint[target_idx]

            run_cum_weights.append(weight)
            run_rates.append(rate)
            run_times.append(time)

        # calculate the residence times
        run_rts = []
        for rate in run_rates:

            # we can't divide by zero with simtk units so we get the
            # value and invert it with a numpy float64 then
            # redimensionalize it
            run_rt = (np.float64(1.0) / rate.value_in_unit(rate.unit)) * (1/rate.unit)

            # if the rate is nan (which was 0.0 aggregated
            # probability) we catch this and explicitly set the
            # residence time to infinity
            # if np.isnan(rate):
            #     run_rt = np.inf * time_unit
            # # otherwise compute the mfpt/residence time
            # else:
            #     run_rt = 1/rate

            run_rts.append(run_rt)

        runs_weights[run_idx] = run_cum_weights
        runs_rates[run_idx] = run_rates
        runs_rts[run_idx] = run_rts
        runs_times[run_idx] = run_times

    # compute the average unbound probability and MFPT across all runs

    # aggregate all of the runs' weights into a single array and fill
    # with nans where missing, these will be ignored later
    num_runs = len(runs_times)

    # get the longest run's number of time points
    max_num_timepoints = max([len(run_times) for run_times in runs_times.values()])
    min_num_timepoints = min([len(run_times) for run_times in runs_times.values()])

    # get an array of the actual times for the longest one
    longest_run_idx = sorted([(len(run_times), run_idx)
                              for run_idx, run_times in runs_times.items()],
                             reverse=True)[0][1]
    longest_run_times = runs_times[longest_run_idx]

    # the shortest one
    shortest_run_idx = sorted([(len(run_times), run_idx)
                               for run_idx, run_times in runs_times.items()],
                              reverse=False)[0][1]
    shortest_run_times = runs_times[shortest_run_idx]

    # make an array for all of them where the longest we have is the
    # shortest one
    all_weights = np.empty((num_runs, min_num_timepoints))
    all_weights.fill(np.nan)

    # add the weights vectors to the array
    for i, run_weights in enumerate(runs_weights.values()):
        all_weights[i, 0:] = run_weights[0:min_num_timepoints]

    # compute the arithmetic mean (at this truncated length the array
    # is fully filled, so no nans remain)
    mean_weights = np.mean(all_weights, axis=0)

    # compute standard error of means
    sem_weights = scipy.stats.sem(all_weights, axis=0, nan_policy='raise')

    # get the bounds of the SEM envelope around the mean so we can
    # transform them
    upper_sem_bounds = mean_weights + sem_weights
    lower_sem_bounds = mean_weights - sem_weights

    # TODO compute harmonic mean as well of the rates

    # TODO: Alex wants me to calculate this just as a transformation
    # of the computed mean and std errors of the probability. Not sure
    # of the validity of this.

    shortest_run_time_values = np.array([time.value_in_unit(time_unit)
                                         for time in shortest_run_times])

    # then compute MFPTs
    # mask the values for which no data was observed (nans)

    # calculate the rates for all the runs, masking the zero values
    # which need special tolerances due to the small values. These
    # small values are dependent upon the minimum probabilities
    # allowed by the simulation so this must be a parameter.
    #masked_mean_weights = np.ma.masked_invalid(mean_weights)

    # actually calculate the average mfpts from the mean weights over
    # time
    mfpts = shortest_run_time_values / mean_weights

    # compute the error in the rates using the relationship between
    # the MFPT and the weight: MFPT_error = weight_error * t * weight^-2
    mfpt_sems = sem_weights * shortest_run_time_values * (mean_weights**-2)

    upper_sem_mfpts = mfpts + mfpt_sems
    lower_sem_mfpts = mfpts - mfpt_sems

    # clip the things that are close to zero or less than zero since
    # this is just a floating point subtraction error
    close_to_zero_idx = np.argwhere(np.logical_or(np.isclose(lower_sem_mfpts, 0.),
                                                  lower_sem_mfpts < 0.))
    lower_sem_mfpts[close_to_zero_idx] = 0.

    # then compute the bounds for the MFPTs
    # upper_sem_mfpts = shortest_run_time_values / upper_sem_bounds
    # lower_sem_mfpts = shortest_run_time_values / lower_sem_bounds

    # because the lower standard error bound is lower than the average
    # we want to make sure the call to isclose doesn't count them as 0
    # so we make a new min_prob for the masking that takes into
    # account the lowest value. So we take the minimum value that is
    # not identically 0
    # lower_min_prob = lower_sem_bounds[lower_sem_bounds > 0.].min() * 1e-2

    # actually we are going to consider anything 2 orders of magnitude
    # lower than the minimum probability to be zero so:
    # lower_sem_bound_min_value = min_probability * 1e-2

    # zero_masked_upper_sem_weights = np.ma.masked_values(upper_sem_bounds, 0.,
    #                                                     atol=min_probability)
    # zero_masked_lower_sem_weights = np.ma.masked_values(lower_sem_bounds, 0.,
    #                                                     atol=lower_min_prob)

    # # we calculate the standard error of the mean weights as well
    # zero_masked_upper_sem_mfpts = (shortest_run_time_values /
    #                                zero_masked_upper_sem_weights)
    # zero_masked_lower_sem_mfpts = (shortest_run_time_values /
    #                                zero_masked_lower_sem_weights)

    # make the plots, one for aggregate probability the other for RT
    fig, axes = plt.subplots(1, 2)
    prob_ax = axes[0]
    rt_ax = axes[1]

    for run_idx in run_target_weights_rates.keys():

        # do the weights plot

        # get the times and weights converting if necessary
        run_weights = runs_weights[run_idx]
        run_times = [time.value_in_unit(time_unit) for time in runs_times[run_idx]]

        # slice them to the smallest run length
        run_weights = run_weights[0:min_num_timepoints]
        run_times = run_times[0:min_num_timepoints]

        label = RUN_LABEL_TEMPLATE.format(run_idx)

        # TODO set color better
        prob_ax.plot(run_times, run_weights, label=label, linewidth='3')

        # then do the residence time plots
        run_rts = runs_rts[run_idx]

        # convert the residence times to the time unit specified
        run_rts = [rt * time_unit for rt in run_rts]

    shortest_run_times_values = [time.value_in_unit(time_unit)
                                 for time in shortest_run_times]

    prob_ax.plot(shortest_run_times_values, mean_weights,
                 label='Mean', linewidth='3', color='black')
    prob_ax.fill_between(shortest_run_times_values,
                         mean_weights-sem_weights,
                         mean_weights+sem_weights,
                         label='Std. Error')

    # get the units from the value of the rates
    # rate_unit = runs_rates[list(runs_rates.keys())[0]][0].get_symbol()

    # # convert to the given unit as a rate
    # if time_unit is not None:
    #     rate_unit = 1 / (1 / rate_unit).in_units_of(time_unit)

    prob_ax.set_yscale('log')

    prob_ax.set_ylabel('Aggregated unbound probability')
    # TODO set this intelligently
    prob_ax.set_ylim([1e-13, 1e-2])
    prob_ax.set_xlabel('Simulation time (${}$)'.format(time_unit.get_symbol()))

    prob_ax.legend(loc='best')

    # plot the average rate

    # get the times and convert to the proper units
    times = [time.value_in_unit(time_unit) for time in shortest_run_times]

    # first because they were dedimensionalized we redimensionalize
    # them to the specified time_unit (x-axis) then convert them to
    # the MFPT unit, then dedimensionalize them to raw values for
    # plotting
    mfpt_values = np.array([(mfpt * time_unit).value_in_unit(mfpt_unit)
                            for mfpt in mfpts])
    upper_sem_mfpt_values = np.array([(mfpt * time_unit).value_in_unit(mfpt_unit)
                                      for mfpt in upper_sem_mfpts])
    lower_sem_mfpt_values = np.array([(mfpt * time_unit).value_in_unit(mfpt_unit)
                                      for mfpt in lower_sem_mfpts])

    # clip the things that are close to zero or less than zero since
    # this is just a floating point error from multiplying...
    close_to_zero_idx = np.argwhere(np.logical_or(np.isclose(lower_sem_mfpt_values, 0.),
                                                  lower_sem_mfpts < 0.))
    lower_sem_mfpt_values[close_to_zero_idx] = 0.

    rt_ax.plot(times, mfpt_values, label='Mean', linewidth='3', color='black')
    rt_ax.fill_between(times,
                       upper_sem_mfpt_values,
                       lower_sem_mfpt_values,
                       label='Std. Error of Mean')

    if logscale:
        rt_ax.set_yscale('log')

    # plt.xticks([400, 800, 1200, 1600], ['400', '800', '1200', '1600'])
    # plt.yticks([0.1, 10, 1000, 100000, 10000000, 1000000000])

    rt_ax.set_xlabel('Simulation time ({})'.format(time_unit.get_symbol()))
    rt_ax.set_ylabel('Predicted residence time ({})'.format(mfpt_unit.get_symbol()))

    #rt_ax.set_ylim([1e-1, 1e9])

    # plot the experimental value
    experimental_points = [experimental_rt.value_in_unit(mfpt_unit)
                           for _ in run_times]
    rt_ax.plot(times, experimental_points, color='r', label='Experimental')

    rt_ax.legend()

    # return the actual computed values for the average aggregated
    # weights and residence times
    mfpts = np.array([(mfpt * time_unit).value_in_unit(mfpt_unit)
                      for mfpt in mfpts]) * mfpt_unit
    mfpt_sems = np.array([(mfpt * time_unit).value_in_unit(mfpt_unit)
                          for mfpt in mfpt_sems]) * mfpt_unit

    return mean_weights, sem_weights, mfpts, mfpt_sems, shortest_run_times, \
        (fig, prob_ax, rt_ax)
#+END_SRC
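For reference, the propagation behind ~mfpt_sems~ above: with mean unbound probability \(\bar{P}(t)\), the residence time estimate is \(\mathrm{MFPT}(t) = t / \bar{P}(t)\), so first-order propagation of the standard error \(\sigma_{P}\) gives \(\sigma_{\mathrm{MFPT}} = t\,\sigma_{P} / \bar{P}(t)^{2}\), which is exactly what the ~mfpt_sems~ line computes.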
**** Plot Rates and RT

Actually perform the plotting for the different experimental groups and runs. For this we want to plot each run independently and then aggregate them.

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
def gexp_plot_rates_rt(gexp):

    import matplotlib.pyplot as plt

    from wepy.analysis.rates import contig_warp_rates

    span_rates = {}
    for span_id in get_gexp_span_ids(gexp):

        contig = get_gexp_span_contig(gexp, span_id)

        with contig:
            # get the values as a series
            contig_rates = contig_warp_rates(contig, CYCLE_TIME,
                                             time_points=Ellipsis)

        span_rates[span_id] = contig_rates

    # make the plot for each span
    mean_weights, sem_weights, mfpts, mfpt_sems, total_sampling_times, axes = \
        plot_rates(span_rates,
                   CYCLE_TIME,
                   EXPERIMENTAL_RT,
                   target_idx=0,
                   min_probability=WE_PMIN,
                   mfpt_unit=tkunit.minute,
                   time_unit=tkunit.microsecond,
                   logscale=True)

    print("Gexp: {}".format(gexp))
    print("Final MFPT: {}".format(mfpts[-1]))
    print("Final MFPT SEM: {}".format(mfpt_sems[-1]))

    # TODO: the total sampling time will be gotten another way
    ##################################################################
    # print("Total sampling time: {}".format(
    #     total_sampling_times[-1].value_in_unit(tkunit.microsecond)
    #     * tkunit.microsecond
    #     * len(contigtree.span_traces))
    # )
    ##################################################################

    plt.show()
#+END_SRC

*** Free Energy Profiles

** Prefect Workflows

*** Header

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_pipelines.py
"""Generated file from the analysis.org file. Do not edit directly."""

import inspect

from prefect import Flow
import prefect

# (the original said wepy.lysozyme_test, but the package is
# wepy_lysozyme_test)
import wepy_lysozyme_test._tasks as tasks_module

# these helper functions are for automatically listing all of the
# functions defined in the tasks module

def is_mod_function(mod, func):
    return inspect.isfunction(func) and inspect.getmodule(func) == mod

def get_functions(mod):

    # get only the functions that are defined in the module itself
    # (not imported) and that aren't private
    return {func.__name__ : func
            for func in mod.__dict__.values()
            if (is_mod_function(mod, func) and
                not func.__name__.startswith('_'))
    }

# get the task functions and wrap them as prefect tasks
tasks = {name : prefect.task(func)
         for name, func in get_functions(tasks_module).items()}
#+END_SRC

*** Example

#+begin_src python :tangle src/wepy_lysozyme_test/_pipelines.py
test_flow = Flow("Test flow")

# you can add tasks this way:
with test_flow:
    result = tasks['test']()
#+end_src

** Execution

A less heavyweight alternative to running pipelines like below. Each execution instance will become a submodule of the 'project_name.execution' module. You can run them like this:

#+begin_src bash
python -m project_name.execution.my_execution_script
#+end_src

Execution scripts should be self contained in terms of domain parameters. An execution script may have command line parameters related to execution tweaking, i.e. which dask cluster to use, how many cores, etc.

**** Executors

Functions that allow for specifying different executions. These should only be called under ~if __name__ == "__main__"~ blocks as they will ask for command line input.

***** Local Machine

Trivial example of an executor that just runs the function.

#+begin_src python :tangle src/wepy_lysozyme_test/execution/__init__.py
def execute_locally(func):
    func()
#+end_src

***** Local Dask Cluster

Either connect to an existing dask cluster or start one up locally.
***** Local Dask Cluster

Either connect to an existing dask cluster or start one up locally.

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/execution/__init__.py
def dask_execute(func, processes=False, n_workers=4):

    import sys

    from dask.distributed import Client, LocalCluster

    cluster_address = sys.argv[1]

    DASHBOARD_PORT = 9998

    if cluster_address == ':local':

        cluster = LocalCluster(processes=processes,
                               n_workers=n_workers,
                               dashboard_address=":{}".format(DASHBOARD_PORT))

        print("Ad hoc cluster online. Dashboard on port {}".format(DASHBOARD_PORT))

        client = Client(cluster)

    else:
        client = Client(cluster_address)

    func(client)
#+END_SRC

***** Prefect Pipeline

**** Scripts

***** Example: Raw

An example showing that you don't need any framework to help you run
something. While tasks should be functional (and the only state saved
is caching) you can handle side effects like saving files etc. here.

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/execution/example_raw.py
def make_result(message):

    from wepy_lysozyme_test._tasks import test

    test()

    return "Here is the result: " + message

if __name__ == "__main__":

    result = make_result("Testing execution out")
#+END_SRC

***** Example: Using a Dask Cluster

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/execution/example_dask.py
import os.path as osp

# the function here where the first argument must be a client to the
# cluster
def func_closure(client):

    # data_path and save_obj are project helpers from _tasks
    from wepy_lysozyme_test._tasks import (
        important_calculation,
        data_path,
        save_obj,
    )

    result = client.submit(important_calculation, "logging..").result()

    # this defines which format to save it in, we are using the joblib
    # pickle format
    ext = 'jl.pkl'
    result_file_path = osp.join(data_path(), f'my_results/result_A.{ext}')

    # save the result to the file data store in the joblib pickle
    # format
    save_obj(result_file_path, result, overwrite=True, ext='jl.pkl')

if __name__ == "__main__":

    from wepy_lysozyme_test.execution import dask_execute

    # execute and receive options from command line
    dask_execute(func_closure)
#+END_SRC
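Since ~dask_execute~ reads the cluster address from ~sys.argv[1]~,
the script above can either spin up an ad hoc local cluster or attach
to an existing scheduler (the scheduler address here is just an
example):

#+begin_src bash
# start an ad hoc local cluster
python -m wepy_lysozyme_test.execution.example_dask :local

# or connect to a running dask scheduler by address
python -m wepy_lysozyme_test.execution.example_dask tcp://127.0.0.1:8786
#+end_src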
***** Example: Prefect Flow

#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/execution/example_flow.py
if __name__ == "__main__":

    from prefect.engine.executors import LocalExecutor

    # get the flow
    from wepy_lysozyme_test._pipelines import test_flow

    # instantiate an executor from Prefect. We use the local one here
    # for testing
    executor = LocalExecutor()

    # run the flow with the executor
    state = test_flow.run(executor=executor)
#+END_SRC

***** Link Results

#+begin_src python :tangle src/wepy_lysozyme_test/execution/link_results.py
def link_results():

    from wepy_lysozyme_test._tasks import link_gexp_hdf5s, EXPERIMENTAL_GROUPS

    for gexp in EXPERIMENTAL_GROUPS:
        path = link_gexp_hdf5s(gexp)

        print(f"Experimental group {gexp} results at {path}")

if __name__ == "__main__":
    link_results()
#+end_src

***** Resampling Tree Visualizations

#+begin_src python :tangle src/wepy_lysozyme_test/execution/gen_restrees.py
import click

@click.command()
@click.argument('gexp')
def gen_restrees(gexp):

    from wepy_lysozyme_test._tasks import (
        get_gexp_span_ids,
        save_restree_gexf,
    )

    for span_id in get_gexp_span_ids(gexp):
        save_restree_gexf(gexp, span_id)

if __name__ == "__main__":
    gen_restrees()
#+end_src

***** Warping lineage trajectories

****** Separated

#+begin_src python :tangle src/wepy_lysozyme_test/execution/gen_warp_lineages_multi.py
import click

@click.command()
@click.argument('gexp')
def gen_warp_lineages_multi(gexp):

    from wepy_lysozyme_test._tasks import (
        get_gexp_span_ids,
        save_warp_lineages_dcds,
    )

    print(f"Working on gexp: {gexp}")
    for span_id in get_gexp_span_ids(gexp):

        print(f"Working on span {span_id}")
        save_warp_lineages_dcds(gexp, span_id)

if __name__ == "__main__":
    gen_warp_lineages_multi()
#+end_src

****** Concatenated

#+begin_src python :tangle src/wepy_lysozyme_test/execution/gen_warp_lineages_single.py
import click

@click.command()
@click.argument('gexp')
def gen_warp_lineages_single(gexp):

    from wepy_lysozyme_test._tasks import (
        get_gexp_span_ids,
        save_warp_lineages_dcd,
    )

    print(f"Working on gexp: {gexp}")
    for span_id in get_gexp_span_ids(gexp):

        print(f"Working on span {span_id}")
        save_warp_lineages_dcd(gexp, span_id)

if __name__ == "__main__":
    gen_warp_lineages_single()
#+end_src

***** Warp plots

#+begin_src python :tangle src/wepy_lysozyme_test/execution/show_rate_plots.py
import click

@click.command()
@click.argument('gexp')
def show_rate_plots(gexp):

    from wepy_lysozyme_test._tasks import (
        gexp_plot_rates_rt,
    )

    gexp_plot_rates_rt(gexp)

if __name__ == "__main__":
    show_rate_plots()
#+end_src

***** Contig Stats tables

#+begin_src python :tangle src/wepy_lysozyme_test/execution/gen_contig_stats_table.py
import click

@click.command()
def gen_contig_stats_table():

    from tabulate import tabulate

    from wepy_lysozyme_test._tasks import (
        save_span_stats_table,
        span_stats_table_str,
    )

    save_span_stats_table(overwrite=True)

    click.echo(span_stats_table_str())

if __name__ == "__main__":
    gen_contig_stats_table()
#+end_src

* Invocations

The actual invocations you will make on the command line to run
stuff. Use TODO or checkboxes to manage them.
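Status of the jobs recorded below can be checked with the standard
SLURM tooling (nothing project specific here; the job ID is one from
the tables below):

#+begin_src bash
# list this user's jobs in the queue
squeue -u lotzsamu

# accounting info for a finished job
sacct -j 49952117
#+end_src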
** NoResampler test jobs

*** DONE <2019-11-15 Fri 18:25>

#+begin_src bash
scancel 49420612
#+end_src

*** DONE <2019-11-19 Tue>

#+begin_src bash
(cd hpcc/wepy && sbatch submissions/noresampler_test.sh.slurm)
#+end_src

#+begin_example
(base) lotzsamu@dev-intel16-k80 :$19: [/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test]{Tue Nov 19 12:12:21}
0: --> (cd hpcc/wepy && sbatch submissions/noresampler_test.sh.slurm)
Submitted batch job 49870580
#+end_example

*** DONE <2019-11-19 Tue 13:20>

#+begin_src bash
(cd hpcc/wepy && sbatch submissions/noresampler_test.sh.slurm)
#+end_src

#+begin_example
(wepy.lysozyme_test.sims) lotzsamu@dev-intel16-k80 :$19: [/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test]{Tue Nov 19 01:19:55}
0: --> (cd hpcc/wepy && sbatch submissions/noresampler_test.sh.slurm)
Submitted batch job 49874750
#+end_example

** Resampler Comparison Simulation jobs

*** DONE Run tests with new setups

#+begin_src bash
(cd hpcc/wepy && (
sbatch submissions/revo_test.sh.slurm;
sbatch submissions/wexplore_test.sh.slurm;
))
#+end_src

*** DONE Run jobs

Run 3-4 replicates each:

#+begin_src bash
(cd hpcc/wepy && (
# the resampler ones
for i in {0,1,2,3}; do
    sbatch submissions/noresampler_test.sh.slurm;
    sbatch submissions/revo_test.sh.slurm;
    sbatch submissions/wexplore_test.sh.slurm;
done
))
#+end_src

**** <2019-11-20 Wed>

#+begin_example
(base) lotzsamu@dev-intel16-k80 :$19: [/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test]{Wed Nov 20 02:44:56}
0: --> (cd hpcc/wepy && (
> # the resampler ones
> for i in {0,1,2,3}; do
> sbatch submissions/noresampler_test.sh.slurm;
> sbatch submissions/revo_test.sh.slurm;
> sbatch submissions/wexplore_test.sh.slurm;
> done
> ))
Submitted batch job 49952117
Submitted batch job 49952118
Submitted batch job 49952119
Submitted batch job 49952120
Submitted batch job 49952121
Submitted batch job 49952122
Submitted batch job 49952123
Submitted batch job 49952124
Submitted batch job 49952125
Submitted batch job 49952126
Submitted batch job 49952127
Submitted batch job 49952128
#+end_example

**** <2019-11-21 Thu 12:10>

I changed the file names out from under the job scheduler, so some of
them failed.
Resubmit them:

#+begin_src bash
(cd hpcc/wepy &&
sbatch submissions/revo.sh.slurm;
sbatch submissions/wexplore.sh.slurm
)
#+end_src

#+begin_example
(common) lotzsamu@dev-intel16-k80 :$19: [/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test]{Thu Nov 21 12:05:58}
0: --> (cd hpcc/wepy &&
> sbatch submissions/revo.sh.slurm;
> sbatch submissions/wexplore.sh.slurm
> )
Submitted batch job 50036052
Submitted batch job 50036053
(common) lotzsamu@dev-intel16-k80 :$19: [/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test]{Thu Nov 21 12:11:50}
0: --> (cd hpcc/wepy && sbatch submissions/noresampler_ensemble.sh.slurm; )
Submitted batch job 50036078
#+end_example

** Single walker simulation jobs

Run 3-4 replicates each:

#+begin_src bash
(cd hpcc/wepy && (
# the resampler ones
for i in {0,1,2,3}; do
    sbatch submissions/noresampler_single.sh.slurm;
done
))
#+end_src

*** <2019-11-21 Thu 12:06>

#+begin_example
(common) lotzsamu@dev-intel16-k80 :$19: [/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test]{Thu Nov 21 12:05:39}
0: --> (cd hpcc/wepy && (
> # the resampler ones
> for i in {0,1,2,3}; do
> sbatch submissions/noresampler_single.sh.slurm;
> done
> ))
Submitted batch job 50035864
Submitted batch job 50035866
Submitted batch job 50035867
Submitted batch job 50035868
#+end_example

** Simulations Backups and saving

*** <2019-12-03 Tue 13:21>

- [X] First get them to the local computer

#+begin_src bash
rsync -ravvhhiz \
    lotzsamu@rsync.hpcc.msu.edu:/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test/hpcc/wepy/jobs/ \
    $HOME/tree/lab/projects/wepy.lysozyme_test/hpcc/wepy/jobs
#+end_src

- [X] Pop the links out to resources

#+begin_src bash
symlink-pop /home/salotz/tree/lab/projects/wepy.lysozyme_test/hpcc/wepy/jobs \
    /home/salotz/tree/lab/resources/project-resources/wepy.lysozyme_test/hpcc/wepy/jobs
#+end_src

- [ ] then back them up to volta

Just doing the whole tree since that needs to be done anyway.

#+begin_src bash
inv refugue.rsync --source superior --target volta
#+end_src

- [ ] backup to allegheny

#+begin_src bash
inv refugue.rsync --source superior --target allegheny
#+end_src

** Analysis

*** Resampling Trees

**** <2019-12-05 Thu 19:09>

- [ ]

#+begin_src bash
python -m wepy_lysozyme_test.execution.gen_restrees 'WExplore'
#+end_src

- [ ]

#+begin_src bash
python -m wepy_lysozyme_test.execution.gen_restrees 'REVO'
#+end_src

This isn't working because they are too long.

*** Warping Lineages

**** <2019-12-09 Mon 13:04>

To get them all in one trajectory file:

This is failing because some of them are too large, I think.

#+begin_src bash
(for gexp in 'No' 'No_single' 'REVO' 'WExplore'; do
    python -m wepy_lysozyme_test.execution.gen_warp_lineages $gexp || exit 1
done
)
#+end_src

Try this without all of them.
#+begin_src bash
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'No'
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'No_single'
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'REVO'
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'WExplore'
#+end_src

#+begin_src bash
# No
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'No' 0
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'No' 1
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'No' 2

# No_single
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'No_single' 0
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'No_single' 1
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'No_single' 2
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'No_single' 3

# REVO
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'REVO' 0
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'REVO' 1
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'REVO' 2
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'REVO' 3

# WExplore
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'WExplore' 0
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'WExplore' 1
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'WExplore' 2
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'WExplore' 3
#+end_src

**** <2019-12-10 Tue 11:50>

After fixing the recursion issue this should be better now.

#+begin_src bash
(for gexp in 'No' 'No_single' 'REVO'; do
    python -m wepy_lysozyme_test.execution.gen_warp_lineages_multi $gexp || exit 1
done
)
#+end_src

This worked so well that it froze my machine. I'm not sure where it
failed, but it must have been on REVO because there are none written
for it. Let's try this one at a time again.
#+begin_src bash
python -m wepy_lysozyme_test.execution.gen_warp_lineages_single 'WExplore'
python -m wepy_lysozyme_test.execution.gen_warp_lineages_single 'REVO'
#+end_src

*** Spans Table

**** <2019-12-10 Tue 11:52>

#+begin_src bash
python -m wepy_lysozyme_test.execution.gen_contig_stats_table
#+end_src

Here is the result in org format:

|    | gexp | span_id | n_runs | n_warps | n_cycles | total_sampling_time_ps | warps_per_ps | rate_ps | total_warped_weight | warp_weight_mean | warp_weight_median | warp_weight_min | warp_weight_max | warp_wait-time_mean_ps | warp_wait-time_median_ps | warp_wait-time_min_ps | warp_wait-time_max_ps |
|----+-----------+---------+--------+---------+----------+------------------------+--------------+---------+---------------------+------------------+--------------------+-----------------+-----------------+------------------------+--------------------------+-----------------------+-----------------------|
|  0 | No | 0 | 1 | 7 | 1101 | 1.05696e+06 | 6.62277e-06 | 1.37974e-07 | 0.145833 | 0.0208333 | 0.0208333 | 0.0208333 | 0.0208333 | 18317.1 | 17960 | 14580 | 21240 |
|  1 | No | 1 | 1 | 21 | 4645 | 4.4592e+06 | 4.70936e-06 | 9.81118e-08 | 0.4375 | 0.0208333 | 0.0208333 | 0.0208333 | 0.0208333 | 48672.4 | 41940 | 9760 | 92120 |
|  2 | No | 2 | 1 | 4 | 1107 | 1.06272e+06 | 3.76393e-06 | 7.84151e-08 | 0.0833333 | 0.0208333 | 0.0208333 | 0.0208333 | 0.0208333 | 14915 | 15830 | 6460 | 21540 |
|  3 | No | 3 | 1 | 7 | 1139 | 1.09344e+06 | 6.40181e-06 | 1.33371e-07 | 0.145833 | 0.0208333 | 0.0208333 | 0.0208333 | 0.0208333 | 18940 | 19580 | 11420 | 22580 |
|  4 | No_single | 0 | 1 | 2 | 42914 | 858280 | 2.33024e-06 | 2.33024e-06 | 2 | 1 | 1 | 1 | 1 | 759060 | 759060 | 748720 | 769400 |
|  5 | No_single | 1 | 1 | 2 | 41619 | 832380 | 2.40275e-06 | 2.40275e-06 | 2 | 1 | 1 | 1 | 1 | 730970 | 730970 | 659480 | 802460 |
|  6 | No_single | 2 | 1 | 4 | 42058 | 841160 | 4.75534e-06 | 4.75534e-06 | 4 | 1 | 1 | 1 | 1 | 120240 | 119320 | 44640 | 197680 |
|  7 | No_single | 3 | 1 | 3 | 42642 | 852840 | 3.51766e-06 | 3.51766e-06 | 3 | 1 | 1 | 1 | 1 | 510460 | 478380 | 449320 | 603680 |
|  8 | REVO | 0 | 1 | 1381 | 1142 | 1.09632e+06 | 0.00125967 | 3.6137e-08 | 0.0396177 | 2.86877e-05 | 5.29838e-11 | 1.00581e-12 | 0.0308164 | 13088 | 13760 | 1100 | 22780 |
|  9 | REVO | 1 | 1 | 1458 | 1126 | 1.08096e+06 | 0.0013488 | 3.22975e-09 | 0.00349123 | 2.39453e-06 | 1.01315e-10 | 1.01126e-12 | 0.00152529 | 14434 | 14900 | 320 | 22500 |
| 10 | REVO | 2 | 1 | 469 | 1103 | 1.05888e+06 | 0.000442921 | 3.23438e-13 | 3.42482e-07 | 7.3024e-10 | 5.8895e-12 | 1.0023e-12 | 2.54594e-08 | 12041.7 | 11660 | 340 | 21900 |
| 11 | REVO | 3 | 1 | 1029 | 1100 | 1.056e+06 | 0.000974432 | 5.1951e-09 | 0.00548602 | 5.33141e-06 | 1.69055e-11 | 1.00155e-12 | 0.00320306 | 10512.9 | 9380 | 160 | 21980 |
| 12 | WExplore | 0 | 1 | 70 | 1134 | 1.08864e+06 | 6.43004e-05 | 7.02838e-08 | 0.0765138 | 0.00109305 | 9.23911e-08 | 1.92453e-12 | 0.0387847 | 11624.3 | 11340 | 420 | 21720 |
| 13 | WExplore | 1 | 1 | 85 | 1129 | 1.08384e+06 | 7.84249e-05 | 1.58479e-07 | 0.171766 | 0.00202078 | 1.91101e-08 | 4.27336e-12 | 0.0670075 | 10480.9 | 9360 | 920 | 22320 |
| 14 | WExplore | 2 | 1 | 264 | 4585 | 4.4016e+06 | 5.99782e-05 | 8.50116e-09 | 0.0374187 | 0.000141738 | 1.07901e-07 | 1.04765e-12 | 0.0138983 | 47844.8 | 50220 | 1680 | 91560 |
| 15 | WExplore | 3 | 1 | 67 | 1095 | 1.0512e+06 | 6.37367e-05 | 3.54706e-07 | 0.372867 | 0.00556517 | 5.52999e-07 | 1.84583e-12 | 0.357752 | 11421.2 | 11540 | 320 | 21300 |
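For downstream comparisons it can be handier to work with this table
as a dataframe than as text; a minimal sketch, assuming the table was
exported to a hypothetical CSV path (the real location is whatever
~save_span_stats_table~ writes to):

#+begin_src python
import pandas as pd

# hypothetical path; see save_span_stats_table for the real location
stats_df = pd.read_csv('data/analysis/span_stats.csv', index_col=0)

# e.g. compare unbinding rates across the resamplers
print(stats_df.groupby('gexp')['rate_ps'].describe())
#+end_src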
*** Rates

**** <2019-12-10 Tue 21:43>

#+begin_src bash
(for gexp in 'No' 'No_single' 'REVO' 'WExplore'; do
    python -m wepy_lysozyme_test.execution.show_rate_plots $gexp || exit 1
done)
#+end_src

For the 'No' simulations:

#+begin_example
Traceback (most recent call last):
  File "/home/salotz/anaconda3/envs/wepy.lysozyme_test.common_dev/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/salotz/anaconda3/envs/wepy.lysozyme_test.common_dev/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/salotz/tree/lab/projects/wepy.lysozyme_test/src/wepy_lysozyme_test/execution/show_rate_plots.py", line 15, in <module>
    show_rate_plots()
  File "/home/salotz/.local/lib/python3.6/site-packages/click/core.py", line 764, in __call__
    return self.main(*args, **kwargs)
  File "/home/salotz/.local/lib/python3.6/site-packages/click/core.py", line 717, in main
    rv = self.invoke(ctx)
  File "/home/salotz/.local/lib/python3.6/site-packages/click/core.py", line 956, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/home/salotz/.local/lib/python3.6/site-packages/click/core.py", line 555, in invoke
    return callback(*args, **kwargs)
  File "/home/salotz/tree/lab/projects/wepy.lysozyme_test/src/wepy_lysozyme_test/execution/show_rate_plots.py", line 11, in show_rate_plots
    gexp_plot_rates_rt(gexp)
  File "/home/salotz/tree/lab/projects/wepy.lysozyme_test/src/wepy_lysozyme_test/_tasks.py", line 1333, in gexp_plot_rates_rt
    contig = get_gexp_span_contig(gexp, span_id)
  File "/home/salotz/tree/lab/projects/wepy.lysozyme_test/src/wepy_lysozyme_test/_tasks.py", line 672, in get_gexp_span_contig
    span_ids_to_span_idxs_d = get_gexp_span_ids_span_idxs_map(gexp)
  File "/home/salotz/anaconda3/envs/wepy.lysozyme_test.common_dev/lib/python3.7/site-packages/joblib/memory.py", line 568, in __call__
    return self._cached_call(args, kwargs)[0]
  File "/home/salotz/anaconda3/envs/wepy.lysozyme_test.common_dev/lib/python3.7/site-packages/joblib/memory.py", line 534, in _cached_call
    out, metadata = self.call(*args, **kwargs)
  File "/home/salotz/anaconda3/envs/wepy.lysozyme_test.common_dev/lib/python3.7/site-packages/joblib/memory.py", line 734, in call
    output = self.func(*args, **kwargs)
  File "/home/salotz/tree/lab/projects/wepy.lysozyme_test/src/wepy_lysozyme_test/_tasks.py", line 653, in get_gexp_span_ids_span_idxs_map
    for span_id, run_idxs in get_gexp_span_ids_run_idxs(gexp).items()}
  File "/home/salotz/tree/lab/projects/wepy.lysozyme_test/src/wepy_lysozyme_test/_tasks.py", line 583, in get_gexp_span_ids_run_idxs
    jobs_runs_d = get_gexp_jobs_to_runs_map(gexp)
  File "/home/salotz/tree/lab/projects/wepy.lysozyme_test/src/wepy_lysozyme_test/_tasks.py", line 554, in get_gexp_jobs_to_runs_map
    jobid = run_grp.attrs['jobid']
  File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
  File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
  File "/home/salotz/anaconda3/envs/wepy.lysozyme_test.common_dev/lib/python3.7/site-packages/h5py/_hl/attrs.py", line 60, in __getitem__
    attr = h5a.open(self._id, self._e(name))
  File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
  File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
  File "h5py/h5a.pyx", line 77, in h5py.h5a.open
KeyError: "Can't open attribute (can't locate attribute: 'jobid')"
#+end_example

**** <2019-12-11 Wed 15:26>

#+begin_src bash
(for gexp in 'No' 'No_single' 'REVO' 'WExplore'; do
    python -m wepy_lysozyme_test.execution.show_rate_plots $gexp || exit 1
done)
#+end_src

*** TODO Compute Observables
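The observables still need to be computed (see the Log below for the
list). As a sketch of the intended approach: ~WepyHDF5~ can map a
function over trajectory fields and save the result as an observable.
The ligand atom indices and the observable here are made up for
illustration, and the exact callback signature may differ between
wepy versions:

#+begin_src python
import numpy as np

from wepy.hdf5 import WepyHDF5
from wepy_lysozyme_test._tasks import get_job_hdf5_path

# hypothetical ligand atom selection; really this would be read from
# the topology (the benzene ligand in the L99A cavity)
LIG_IDXS = np.arange(2603, 2615)

def lig_centroid(fields_d, *args):
    # toy observable: geometric center of the ligand for each frame
    return fields_d['positions'][:, LIG_IDXS, :].mean(axis=1)

wepy_h5 = WepyHDF5(get_job_hdf5_path('49952117'), mode='r+')

with wepy_h5:
    # maps over every trajectory and saves the result under
    # 'observables/lig_centroid'
    wepy_h5.compute_observable(
        lig_centroid,
        ['positions'],
        (),
        save_to_hdf5='lig_centroid',
    )
#+end_src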
** Troubleshooting

*** TODO jobids missing

*** DONE Bad heap HDF5 thing

#+begin_example
In [5]: get_gexp_span_contig('No', 3)
---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
<ipython-input-5-...> in <module>
----> 1 get_gexp_span_contig('No', 3)

~/tree/lab/projects/wepy.lysozyme_test/src/wepy_lysozyme_test/_tasks.py in get_gexp_span_contig(gexp, span_id)
    557     # get the contig for this span
    558     contigtree = get_contigtree(gexp)
--> 559     contig = contigtree.span_contig(span_idx)
    560
    561     return contig

~/tree/lab/devel/wepy/src/wepy/analysis/contig_tree.py in span_contig(self, span_idx)
    215         """Generates a contig object for the specified spanning contig."""
    216
--> 217         contig = self.make_contig(self.span_traces[span_idx])
    218
    219         return contig

~/tree/lab/devel/wepy/src/wepy/analysis/contig_tree.py in make_contig(self, contig_trace)
   1329                 continuations=continuations,
   1330                 boundary_condition_class=self.boundary_condition_class,
-> 1331                 decision_class=self.decision_class)
   1332
   1333

~/tree/lab/devel/wepy/src/wepy/analysis/contig_tree.py in __init__(self, wepy_h5, **kwargs)
   1413
   1414         # use the superclass initialization
-> 1415         super().__init__(wepy_h5, **kwargs)
   1416
   1417         # check that the result is a single contig

~/tree/lab/devel/wepy/src/wepy/analysis/contig_tree.py in __init__(self, wepy_h5, base_contigtree, continuations, runs, boundary_condition_class, decision_class)
   1207                 runs=runs,
   1208                 boundary_condition_class=boundary_condition_class,
-> 1209                 decision_class=decision_class)
   1210
   1211         self._set_base_contigtree_to_self(new_contigtree)

~/tree/lab/devel/wepy/src/wepy/analysis/contig_tree.py in __init__(self, wepy_h5, continuations, runs, boundary_condition_class, decision_class)
    175         self._create_tree(wepy_h5)
    176
--> 177         self._set_resampling_panels(wepy_h5)
    178
    179         if self._decision_class is not None:

~/tree/lab/devel/wepy/src/wepy/analysis/contig_tree.py in _set_resampling_panels(self, wepy_h5)
    264         for run_idx in self.run_idxs:
    265
--> 266             run_resampling_panel = wepy_h5.run_resampling_panel(run_idx)
    267
    268             # add each cycle of this panel to the network by adding

~/tree/lab/devel/wepy/src/wepy/hdf5.py in run_resampling_panel(self, run_idx)
   4883
   4884         """
-> 4885         return self.run_contig_resampling_panel([run_idx])
   4886
   4887     def run_contig_resampling_panel(self, run_idxs):

~/tree/lab/devel/wepy/src/wepy/hdf5.py in run_contig_resampling_panel(self, run_idxs)
   4908
   4909         # make the resampling panel from the resampling records for the contig
-> 4910         contig_resampling_panel = resampling_panel(self.resampling_records(run_idxs),
   4911                                                    is_sorted=False)
   4912

~/tree/lab/devel/wepy/src/wepy/hdf5.py in resampling_records(self, run_idxs)
   4680         """
   4681
-> 4682         return self.run_contig_records(run_idxs, RESAMPLING)
   4683
   4684     def resampling_records_dataframe(self, run_idxs):

~/tree/lab/devel/wepy/src/wepy/hdf5.py in run_contig_records(self, run_idxs, run_record_key)
   4613         # sporadic then we just use the cycle idxs
   4614         if self._is_sporadic_records(run_record_key):
-> 4615             records = self._run_records_sporadic(run_idxs, run_record_key)
   4616         else:
   4617             records = self._run_records_continual(run_idxs, run_record_key)

~/tree/lab/devel/wepy/src/wepy/hdf5.py in _run_records_sporadic(self, run_idxs, run_record_key)
   2055         # get all the value columns from the datasets, and convert
   2056         # them to something amenable to a table
-> 2057         run_fields = self._convert_record_fields_to_table_columns(run_idx, run_record_key)
   2058
   2059         # we need to concatenate each field to the end of the
~/tree/lab/devel/wepy/src/wepy/hdf5.py in _convert_record_fields_to_table_columns(self, run_idx, run_record_key)
   1978         for record_field in self.record_fields[run_record_key]:
   1979             fields[record_field] = self._convert_record_field_to_table_column(
-> 1980                 run_idx, run_record_key, record_field)
   1981
   1982         return fields

~/tree/lab/devel/wepy/src/wepy/hdf5.py in _convert_record_field_to_table_column(self, run_idx, run_record_key, record_field)
   1931         # cast all elements to tuples
   1932         if h5py.check_dtype(vlen=dset.dtype) is not None:
-> 1933             rec_dset = [tuple(value) for value in dset[:]]
   1934
   1935         # if it is not variable length make sure it is not more than a

h5py/_objects.pyx in h5py._objects.with_phil.wrapper()

h5py/_objects.pyx in h5py._objects.with_phil.wrapper()

~/anaconda3/envs/wepy.lysozyme_test.common_dev/lib/python3.7/site-packages/h5py/_hl/dataset.py in __getitem__(self, args)
    571         mspace = h5s.create_simple(mshape)
    572         fspace = selection.id
--> 573         self.id.read(mspace, fspace, arr, mtype, dxpl=self._dxpl)
    574
    575         # Patch up the output for NumPy

h5py/_objects.pyx in h5py._objects.with_phil.wrapper()

h5py/_objects.pyx in h5py._objects.with_phil.wrapper()

h5py/h5d.pyx in h5py.h5d.DatasetID.read()

h5py/_proxy.pyx in h5py._proxy.dset_rw()

h5py/_proxy.pyx in h5py._proxy.H5PY_H5Dread()

OSError: Can't read data (bad global heap collection signature)
#+end_example

50036078 is the job ID it is failing on. Let's try recopying it;
maybe it had an error during transfer.

Found out that specifically it is the target idxs that are kind of
screwy:

#+begin_example
(pdb++)>>> no = self._convert_record_field_to_table_column(run_idx, run_record_key, 'target_idxs')
,*** OSError: Can't read data (bad global heap collection signature)
Traceback (most recent call last):
  File "/home/salotz/tree/lab/devel/wepy/src/wepy/hdf5.py", line 1933, in _convert_record_field_to_table_column
    rec_dset = [tuple(value) for value in dset[:]]
  File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
  File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
  File "/home/salotz/anaconda3/envs/wepy.lysozyme_test.common_dev/lib/python3.7/site-packages/h5py/_hl/dataset.py", line 573, in __getitem__
    self.id.read(mspace, fspace, arr, mtype, dxpl=self._dxpl)
  File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
  File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
  File "h5py/h5d.pyx", line 182, in h5py.h5d.DatasetID.read
  File "h5py/_proxy.pyx", line 158, in h5py._proxy.dset_rw
  File "h5py/_proxy.pyx", line 84, in h5py._proxy.H5PY_H5Dread
#+end_example

**** Trying to read them straight from the allegheny disk locally

#+begin_src python
from wepy.hdf5 import WepyHDF5

path = '/media/salotz/allegheny/salotz/tree/lab/projects/wepy.lysozyme_test/data/sims/wepy/jobs/50036078/root.wepy.h5'
wepy_h5 = WepyHDF5(path, mode='r')

with wepy_h5:
    dset = wepy_h5.records_grp(0, 'resampling')['target_idxs']
    dset[:]
#+end_src

**** HPCC's copy

#+begin_src python
from wepy.hdf5 import WepyHDF5

path = '/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test/hpcc/wepy/jobs/50036078/root.wepy.h5'
wepy_h5 = WepyHDF5(path, mode='r')

with wepy_h5:
    dset = wepy_h5.records_grp(0, 'resampling')['target_idxs']
    dset[:]
#+end_src

Okay, this one is fine. Something happened while it was being copied...
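Before re-transferring, it's worth confirming that the corruption
happened in transit by comparing checksums of the two copies (same
paths as above); alternatively, ~rsync --checksum~ forces content
verification during the copy:

#+begin_src bash
# on the cluster
md5sum /mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test/hpcc/wepy/jobs/50036078/root.wepy.h5

# on the local copy
md5sum /home/salotz/tree/lab/projects/wepy.lysozyme_test/data/sims/wepy/jobs/50036078/root.wepy.h5
#+end_src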
**** Solution

Let's get it back. This might take a while...

#+begin_src bash
rsync lotzsamu@rsync.hpcc.msu.edu:/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test/hpcc/wepy/jobs/50036078/root.wepy.h5 \
    /home/salotz/tree/lab/projects/wepy.lysozyme_test/data/sims/wepy/jobs/50036078/root.wepy.h5
#+end_src

It's actually only 9 GB so it should be fine.

Then copy it manually to the drive to make sure it overrides it.

#+begin_src bash
rsync /home/salotz/tree/lab/projects/wepy.lysozyme_test/data/sims/wepy/jobs/50036078/root.wepy.h5 \
    /media/salotz/allegheny/salotz/tree/lab/projects/wepy.lysozyme_test/data/sims/wepy/jobs/50036078/root.wepy.h5
#+end_src

I had renamed them, so get rid of the bad ones once we know it works:

#+begin_src bash
rm -f /home/salotz/tree/lab/projects/wepy.lysozyme_test/data/sims/wepy/jobs/50036078/root_bad.wepy_h5
rm -f /media/salotz/allegheny/salotz/tree/lab/projects/wepy.lysozyme_test/data/sims/wepy/jobs/50036078/root_bad.wepy.h5
#+end_src

*** DONE Getting all lineages of terminal walkers

#+begin_src python :tangle troubleshoot/terminal_lineages.py
import networkx as nx

from wepy.hdf5 import WepyHDF5
from wepy.analysis.contig_tree import BaseContigTree, ContigTree
# NOTE: module location of ParentForest may vary between wepy versions
from wepy.analysis.parent import ParentForest
from wepy.resampling.decisions.clone_merge import MultiCloneMergeDecision
from wepy.resampling.decisions.decision import NoDecision
from wepy.boundary_conditions.receptor import UnbindingBC

from wepy_lysozyme_test._tasks import *

test_h5_path = project_path() / "tmp/test_wepy_sim/root.wepy.h5"

wepy_h5 = WepyHDF5(test_h5_path, mode='r')

base_contigtree = BaseContigTree(wepy_h5,
                                 decision_class=MultiCloneMergeDecision,
                                 boundary_condition_class=UnbindingBC)

contigtree = ContigTree(wepy_h5,
                        base_contigtree=base_contigtree)

res_trace = contigtree.resampling_trace(MultiCloneMergeDecision.ENUM.SQUASH)
warp_trace = contigtree.warp_trace()
final_trace = contigtree.final_trace()

# deduplicate and make the final collection of terminals in a trace
terminal_trace = list(set(res_trace) | set(warp_trace) | set(final_trace))

# then we can get all the lineages for them
lines = contigtree.lineages(terminal_trace)

# TODO: do the full parent tree traversals. This part is still a
# sketch: `contig` and `span_data` are expected to come from the
# surrounding analysis context.

# make the parent tree
parent_forest = ParentForest(contig=contig)

# preorder traversal
preorder_traversal_trace = list(nx.dfs_preorder_nodes(parent_forest._graph))

# convert to a walker trace
preorder_traversal_trace = contig.walker_trace_to_run_trace(
    [(traj_idx, cycle_idx)
     for cycle_idx, traj_idx in preorder_traversal_trace
     if cycle_idx > -1])

span_data['preorder_traversal_trace'] = preorder_traversal_trace

# post order traversal
postorder_traversal_trace = list(nx.dfs_postorder_nodes(parent_forest._graph))

# convert to a walker trace
postorder_traversal_trace = contig.walker_trace_to_run_trace(
    [(traj_idx, cycle_idx)
     for cycle_idx, traj_idx in postorder_traversal_trace
     if cycle_idx > -1])

span_data['postorder_traversal_trace'] = postorder_traversal_trace

## cohort trajs
#+end_src

*** DONE Running Test Simulations

Run a simulation:

#+begin_src bash
./scripts/noresampler_local_test.sh
#+end_src

Now you have some data at: tmp/test_wepy_sims/root.wepy.h5

* Management

Area for managed data like lists and spreadsheets.

** Research

*** Info on the lysozyme system

See citations on bibsonomy.org:
https://www.bibsonomy.org/search/tags:lysozyme%20group:comp-pharm-bio

*** Papers:

**** cite:Baase2010T4LysozymeReview

Review of what the lysozyme system has taught us. Details on the L99A
mutation cavity formation.
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2867005/

**** cite:NunesAlves2018T4LysozymeUnbinding

Paper that does WE simulations with a bunch of hand-crafted progress
coordinates in WESTPA.

https://www.sciencedirect.com/science/article/pii/S0006349518301401

#+BEGIN_QUOTE
The experimental rate constant for benzene dissociation () from T4L
L99A mutant has been determined (58) by lineshape analysis of protein
NMR as 950 s–1 at 303 K.
#+END_QUOTE

#+BEGIN_SRC python
import simtk.unit as unit

rate = 950 * (1/unit.second)
rt = 1 / rate
print(rt)
#+END_SRC

- residence time :: 0.0010526315789473684 s
- PDB structure :: 1L83 https://www.rcsb.org/structure/1L83

MD parameters:

- CHARMM36
- implicit solvation :: Born surface area form
- born radii :: OBC
- dielectric constant :: 80
- nonpolar contributions :: M. Schaefer, C. Bartels, M. Karplus,
  "Solution conformations and thermodynamics of structured peptides:
  molecular dynamics simulation with an implicit solvation model",
  J. Mol. Biol., 284 (1998), pp. 835-848
- surface tension :: 5.4 cal mol^{-1} \AA^{-2}
- temperature :: 300 K
- integration step size :: 2 fs
- integrator :: leapfrog stochastic
- collision frequency :: 10 ps^{-1}
- Hydrogen-Heavy atoms constrained :: LINCS (faster SHAKE algorithm)

WE parameters:

- cycles :: 4000
- cycle time :: 10 ps at 400 K, 2 ps at 300 K
- traj-per-bin :: 4-5

**** cite:Wang2016LysozymeMetadynamicsUnbinding

Metadynamics study of ligand unbinding from lysozyme.

**** cite:Feher1996LysozymeBinding

Experimental rate constant paper.

**** Bibtex entries

#+BEGIN_SRC bibtex :tangle README.bib
@article{Baase2010T4LysozymeReview,
  abstract = {An overview is presented of some of the major insights that have come from studies of the structure, stability, and folding of T4 phage lysozyme. A major purpose of this review is to provide the reader with a complete tabulation of all of the variants that have been characterized, including melting temperatures, crystallographic data, Protein Data Bank access codes, and references to the original literature. The greatest increase in melting temperature (T(m)) for any point mutant is 5.1 degrees C for the mutant Ser 117 --> Val. This is achieved in part not only by hydrophobic stabilization but also by eliminating an unusually short hydrogen bond of 2.48 A that apparently has an unfavorable van der Waals contact. Increases in T(m) of more than 3-4 degrees C for point mutants are rare, whereas several different types of destabilizing substitutions decrease T(m) by 20 degrees C or thereabouts. The energetic cost of cavity creation and its relation to the hydrophobic effect, derived from early studies of "large-to-small" mutants in the core of T4 lysozyme, has recently been strongly supported by related studies of the intrinsic membrane protein bacteriorhodopsin. The L99A cavity in the C-terminal domain of the protein, which readily binds benzene and many other ligands, has been the subject of extensive study. Crystallographic evidence, together with recent NMR analysis, suggest that these ligands are admitted by a conformational change involving Helix F and its neighbors. A total of 43 nonisomorphous crystal forms of different monomeric lysozyme mutants were obtained plus three more for synthetically-engineered dimers.
Among the 43 space groups, P2(1)2(1)2(1) and P2(1) were observed most frequently, consistent with the prediction of Wukovitz and Yeates.},
  author = {Baase, W A and Liu, L and Tronrud, D E and Matthews, B W},
  journal = {Protein Sci},
  misc = {pmid = {20095051},
          doi = {10.1002/pro.344}},
  month = {apr},
  number = {4},
  pages = {631-641},
  title = {Lessons from the lysozyme of phage T4},
  url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2867005/},
  volume = {19},
  year = {2010}
}

@article{NunesAlves2018T4LysozymeUnbinding,
  abstract = {The T4 lysozyme L99A mutant is often used as a model system to study small-molecule binding to proteins, but pathways for ligand entry and exit from the buried binding site and the associated protein conformational changes have not been fully resolved. Here, molecular dynamics simulations were employed to model benzene exit from its binding cavity using the weighted ensemble (WE) approach to enhance sampling of low-probability unbinding trajectories. Independent WE simulations revealed four pathways for benzene exit, which correspond to transient tunnels spontaneously formed in previous simulations of apo T4 lysozyme. Thus, benzene unbinding occurs through multiple pathways partially created by intrinsic protein structural fluctuations. Motions of several α-helices and side chains were involved in ligand escape from metastable microstates. WE simulations also provided preliminary estimates of rate constants for each exit pathway. These results complement previous works and provide a semiquantitative characterization of pathway heterogeneity for binding of small molecules to proteins.},
  author = {Nunes-Alves, Ariane and Zuckerman, Daniel M. and Arantes, Guilherme Menegon},
  journal = {Biophysical Journal},
  misc = {issn = {0006-3495},
          doi = {https://doi.org/10.1016/j.bpj.2018.01.014}},
  number = {5},
  pages = {1058 - 1066},
  title = {Escape of a Small Molecule from Inside T4 Lysozyme by Multiple Pathways},
  url = {http://www.sciencedirect.com/science/article/pii/S0006349518301401},
  volume = {114},
  year = {2018}
}

@article{Wang2016LysozymeMetadynamicsUnbinding,
  author = {Wang, Yong and Papaleo, Elena and Lindorff-Larsen, Kresten},
  journal = {Elife},
  pages = {e17505},
  publisher = {eLife Sciences Publications Limited},
  title = {Mapping transiently formed and sparsely populated conformations on a complex energy landscape},
  volume = {5},
  year = {2016}
}

@article{Feher1996LysozymeBinding,
  author = {Feher, Victoria A and Baldwin, Enoch P and Dahlquist, Frederick W},
  journal = {Nature structural biology},
  number = {6},
  pages = {516},
  publisher = {Nature Publishing Group},
  title = {Access of ligands to cavities within the core of a protein is rapid},
  volume = {3},
  year = {1996}
}
#+END_SRC

** Simulations
:PROPERTIES:
:TABLE_EXPORT_FILE: data/sim_management/jobs.csv
:TABLE_EXPORT_FORMAT: orgtbl-to-csv
:END:

| gexp      | span_id | jobid    | segment_idx | success | status    |
|-----------+---------+----------+-------------+---------+-----------|
| No        |       0 | 49952117 |           0 | yes     | COMPLETED |
| No        |       1 | 49952120 |           0 | yes     | COMPLETED |
| No        |       2 | 49952123 |           0 | yes     | COMPLETED |
| No        |       3 | 50036078 |           0 | yes     | COMPLETED |
| No_single |       0 | 50035864 |           0 | yes     | COMPLETED |
| No_single |       1 | 50035866 |           0 | yes     | COMPLETED |
| No_single |       2 | 50035867 |           0 | yes     | COMPLETED |
| No_single |       3 | 50035868 |           0 | yes     | COMPLETED |
| REVO      |       0 | 49952118 |           0 | yes     | COMPLETED |
| REVO      |       1 | 49952121 |           0 | yes     | COMPLETED |
| REVO      |       2 | 49952124 |           0 | yes     | COMPLETED |
| REVO      |       3 | 50036052 |           0 | yes     | COMPLETED |
| WExplore  |       0 | 49952122 |           0 | yes     | COMPLETED |
| WExplore  |       1 | 49952125 |           0 | yes     | COMPLETED |
| WExplore  |       2 | 49952119 |           0 | yes     | COMPLETED |
| WExplore  |       3 | 50036053 |           0 | yes     | COMPLETED |
|-----------+---------+----------+-------------+---------+-----------|

** Analysis

*** Span Stats

|    | gexp | span_id | n_runs | n_warps | n_cycles | total_sampling_time_ps | warps_per_ps | rate_ps | total_warped_weight | warp_weight_mean | warp_weight_median | warp_weight_min | warp_weight_max | warp_wait-time_mean_ps | warp_wait-time_median_ps | warp_wait-time_min_ps | warp_wait-time_max_ps |
|----+-----------+---------+--------+---------+----------+------------------------+--------------+---------+---------------------+------------------+--------------------+-----------------+-----------------+------------------------+--------------------------+-----------------------+-----------------------|
|  0 | No | 0 | 1 | 7 | 1101 | 1.05696e+06 | 6.62277e-06 | 1.37974e-07 | 0.145833 | 0.0208333 | 0.0208333 | 0.0208333 | 0.0208333 | 18317.1 | 17960 | 14580 | 21240 |
|  1 | No | 1 | 1 | 21 | 4645 | 4.4592e+06 | 4.70936e-06 | 9.81118e-08 | 0.4375 | 0.0208333 | 0.0208333 | 0.0208333 | 0.0208333 | 48672.4 | 41940 | 9760 | 92120 |
|  2 | No | 2 | 1 | 4 | 1107 | 1.06272e+06 | 3.76393e-06 | 7.84151e-08 | 0.0833333 | 0.0208333 | 0.0208333 | 0.0208333 | 0.0208333 | 14915 | 15830 | 6460 | 21540 |
|  3 | No | 3 | 1 | 7 | 1139 | 1.09344e+06 | 6.40181e-06 | 1.33371e-07 | 0.145833 | 0.0208333 | 0.0208333 | 0.0208333 | 0.0208333 | 18940 | 19580 | 11420 | 22580 |
|  4 | No_single | 0 | 1 | 2 | 42914 | 858280 | 2.33024e-06 | 2.33024e-06 | 2 | 1 | 1 | 1 | 1 | 759060 | 759060 | 748720 | 769400 |
|  5 | No_single | 1 | 1 | 2 | 41619 | 832380 | 2.40275e-06 | 2.40275e-06 | 2 | 1 | 1 | 1 | 1 | 730970 | 730970 | 659480 | 802460 |
|  6 | No_single | 2 | 1 | 4 | 42058 | 841160 | 4.75534e-06 | 4.75534e-06 | 4 | 1 | 1 | 1 | 1 | 120240 | 119320 | 44640 | 197680 |
|  7 | No_single | 3 | 1 | 3 | 42642 | 852840 | 3.51766e-06 | 3.51766e-06 | 3 | 1 | 1 | 1 | 1 | 510460 | 478380 | 449320 | 603680 |
|  8 | REVO | 0 | 1 | 1381 | 1142 | 1.09632e+06 | 0.00125967 | 3.6137e-08 | 0.0396177 | 2.86877e-05 | 5.29838e-11 | 1.00581e-12 | 0.0308164 | 13088 | 13760 | 1100 | 22780 |
|  9 | REVO | 1 | 1 | 1458 | 1126 | 1.08096e+06 | 0.0013488 | 3.22975e-09 | 0.00349123 | 2.39453e-06 | 1.01315e-10 | 1.01126e-12 | 0.00152529 | 14434 | 14900 | 320 | 22500 |
| 10 | REVO | 2 | 1 | 469 | 1103 | 1.05888e+06 | 0.000442921 | 3.23438e-13 | 3.42482e-07 | 7.3024e-10 | 5.8895e-12 | 1.0023e-12 | 2.54594e-08 | 12041.7 | 11660 | 340 | 21900 |
| 11 | REVO | 3 | 1 | 1029 | 1100 | 1.056e+06 | 0.000974432 | 5.1951e-09 | 0.00548602 | 5.33141e-06 | 1.69055e-11 | 1.00155e-12 | 0.00320306 | 10512.9 | 9380 | 160 | 21980 |
| 12 | WExplore | 0 | 1 | 70 | 1134 | 1.08864e+06 | 6.43004e-05 | 7.02838e-08 | 0.0765138 | 0.00109305 | 9.23911e-08 | 1.92453e-12 | 0.0387847 | 11624.3 | 11340 | 420 | 21720 |
| 13 | WExplore | 1 | 1 | 85 | 1129 | 1.08384e+06 | 7.84249e-05 | 1.58479e-07 | 0.171766 | 0.00202078 | 1.91101e-08 | 4.27336e-12 | 0.0670075 | 10480.9 | 9360 | 920 | 22320 |
| 14 | WExplore | 2 | 1 | 264 | 4585 | 4.4016e+06 | 5.99782e-05 | 8.50116e-09 | 0.0374187 | 0.000141738 | 1.07901e-07 | 1.04765e-12 | 0.0138983 | 47844.8 | 50220 | 1680 | 91560 |
| 15 | WExplore | 3 | 1 | 67 | 1095 | 1.0512e+06 | 6.37367e-05 | 3.54706e-07 | 0.372867 | 0.00556517 | 5.52999e-07 | 1.84583e-12 | 0.357752 | 11421.2 | 11540 | 320 | 21300 |

* Log

Log of activities
** <2019-11-13 Wed>

Notes for today...

** <2019-12-04 Wed>

Finally finished the simulations, and I know that there were some
good results from the dashboards. Now considering what my next goals
are.

I do want to do some exploratory stuff to make sure the integrity of
the simulations is good. Then I can work on computing some
observables and doing FE profiles. But first I will have to set up
all the data retrieval and contig trees etc.

- [ ] data retrieval functions
- [ ] "experiment" keys - resampler conditions
- [ ] contig trees
- [ ] resampling tree visualization
- [ ] terminal trajectories
  - [ ] exit points
  - [ ] end walkers
- [ ] warping
  - [ ] stats table
  - [ ] rate plots
- [ ] compute observables
  - [ ] ligand RMSD
  - [ ] protein RMSD
  - [ ] SASA
- [ ] FE profile plots

* COMMENT Scrapyard

Things you don't want to throw away but you don't want to keep in the
clean sections above.

** Scratch

*** Example of running lysozyme

#+begin_src python
"""
This is a simple script to run traditional molecular dynamics simulations
"""

import os
from sys import argv
import timeit

from simtk import openmm, unit
from simtk.openmm.app.pdbfile import PDBFile
from simtk.openmm.app import statedatareporter
from openmmtools import testsystems, integrators

try:
    file_name = argv[1]
    total_steps = int(argv[2])
    frame_period = int(argv[3])
except IndexError:
    print('Please enter three arguments: [file name] [number of steps] '
          '[fraction of steps to print to dcd file]')
    raise

try:
    platform_input = argv[4]
except IndexError:
    platform_input = 'CPU'

print('Inputs:'
      '\nFile prefix:', file_name,
      '\nTotal steps:', total_steps,
      '\nFrame period:', frame_period,
      '\nCPU or CUDA?:', platform_input)

dcd_file_name = file_name + '.dcd'
pdb_file_name = file_name + '.pdb'
reporter_file_name = file_name + '.out'
state_file_name = file_name + '_' + platform_input + '.bin'

lysozyme_pxylene = testsystems.LysozymeImplicit()

topology = lysozyme_pxylene.topology
t4_system = lysozyme_pxylene.system

# Use a timestep of 2 fs
new_timestep = unit.quantity.Quantity(2, unit.femtosecond)
integrator = integrators.LangevinIntegrator(timestep=new_timestep)

platform = openmm.Platform.getPlatformByName(platform_input)

if platform_input == 'CUDA':
    # Settings for running on GPUs
    prop = dict(CudaPrecision='single')
    simulation = openmm.app.Simulation(topology, t4_system, integrator, platform, prop)
else:
    # Presumably running on CPUs
    simulation = openmm.app.Simulation(topology, t4_system, integrator, platform)

if os.path.exists(dcd_file_name):
    os.remove(dcd_file_name)

positions = lysozyme_pxylene.positions

# Only write the reference PDB of the atomic positions if it hasn't
# been done before
if not os.path.exists(pdb_file_name):
    with open(pdb_file_name, 'w') as file:
        # MDTraj is preferred for this...
        PDBFile.writeFile(topology, positions, file=file)

if os.path.exists(state_file_name):
    # Load the saved checkpoint of the simulation
    simulation.loadCheckpoint(state_file_name)
    print('Reusing previous checkpoint')
else:
    # Otherwise re-setup the simulation
    print('Creating new checkpoint')
    simulation.context.setPositions(positions)
    simulation.context.setVelocitiesToTemperature(300*unit.kelvin)

    print('Minimizing...')

    # First minimize the energy
    simulation.minimizeEnergy()

    # saveCheckpoint opens the file itself, so there is no need to
    # open it beforehand
    simulation.saveCheckpoint(state_file_name)

simulation.reporters.append(openmm.app.DCDReporter(dcd_file_name, frame_period))
simulation.reporters.append(
    statedatareporter.StateDataReporter(reporter_file_name,
                                        frame_period,
                                        step=True,
                                        time=True,
                                        potentialEnergy=True,
                                        temperature=True,
                                        progress=True,
                                        remainingTime=True,
                                        speed=True,
                                        totalSteps=total_steps,
                                        separator='\t'
                                        )
)

print('Running Production...')

start_time = timeit.default_timer()

simulation.step(total_steps)

finish_time = timeit.default_timer()

print('Done.'
      '\nTotal steps:', total_steps,
      '\nFrame period:', frame_period,
      '\nSimulation runtime (seconds):', finish_time - start_time)
#+end_src

** Analysis

** Invocation

* COMMENT Local Variables

# Local Variables:
# mode: org
# org-todo-keyword-faces: (("TODO" . org-warning) ("INPROGRESS" . "magenta") ("WAIT" . "orange") ("DONE" . org-done) ("CANCELLED" . org-done))
# org-table-export-default-format: orgtbl-to-csv
# End: