-*- mode: org; -*-
* Meta
#+TITLE: wepy.lysozyme_test
#+AUTHOR: Samuel D. Lotz
#+EMAIL: samuel.lotz@salotz.info
#+STARTUP: overview inlineimages
#+TODO: TODO | INPROGRESS WAIT | DONE CANCELLED
* Setup
** Environments
*** common
**** Pip
#+begin_src fundamental :tangle configs/common.requirements.txt
# interaction
ipython
pdbpp
# standard
numpy
pandas
scipy
sqlalchemy
matplotlib
joblib
click
networkx
tabulate
joblib
pint
multiprocessing_logging
# this project
-e .
# my libs
git+https://github.com/ADicksonLab/wepy.git
git+https://github.com/ADicksonLab/geomm.git
git+https://github.com/ADicksonLab/CSNAnalysis
git+https://github.com/salotz/slurmify
pint
#+end_src
**** Conda
#+begin_src fundamental :tangle configs/common.env.yaml
name: wepy.lysozyme_test.project
channels:
- conda-forge
- defaults
- omnia
dependencies:
- openmm
- openmmtools
#+end_src
*** common_dev
**** Pip
#+begin_src fundamental :tangle configs/common_dev.requirements.txt
# interaction
ipython
pdbpp
# tools
# currently this way doesn't work. Conda does. But I've had more success with the debian install
#dvc[all]
# standard
numpy
pandas
scipy
sqlalchemy
matplotlib
joblib
click
networkx
tabulate
joblib
pint
multiprocessing_logging
# this project
-e .
# my libs
-e ../../devel/wepy
-e ../../devel/geomm
-e ../../devel/CSNAnalysis
#+end_src
**** Conda
#+begin_src fundamental :tangle configs/common_dev.env.yaml
name: wepy.lysozyme_test.common_dev
channels:
- conda-forge
- defaults
- omnia
dependencies:
- openmm
- openmmtools
#+end_src
*** sims
**** Pip
#+begin_src fundamental :tangle configs/sims.requirements.txt
-e .
multiprocessing_logging
# my libs
git+https://github.com/ADicksonLab/wepy.git
git+https://github.com/ADicksonLab/geomm.git
#+end_src
**** Conda
#+begin_src fundamental :tangle configs/sims.env.yaml
name: wepy.lysozyme_test.project
channels:
- conda-forge
- defaults
- omnia/label/cuda100
- omnia
dependencies:
- openmm
- openmmtools
#+end_src
*** sims_dev
**** Pip
#+begin_src fundamental :tangle configs/sims_dev.requirements.txt
-e .
multiprocessing_logging
# my libs
-e ../../devel/wepy
git+https://github.com/ADicksonLab/geomm.git
#+end_src
**** Conda
#+begin_src fundamental :tangle configs/sims_dev.env.yaml
name: wepy.lysozyme_test.sims_dev
channels:
- conda-forge
- defaults
- omnia/label/cuda100
- omnia
dependencies:
- openmm
- openmmtools
#+end_src
* Scratch
For scratch work. Ephemeral.
** <2019-12-04 Wed>
#+begin_src python
from wepy_lysozyme_test._tasks import *
from wepy.hdf5 import WepyHDF5
get_job_hdf5_path('49952117')
h5_path = get_job_hdf5_path('49952117')
wepy_h5 = WepyHDF5(h5_path, mode='r')
wepy_h5.open()
#+end_src
** <2019-12-05 Thu>
#+begin_src python
from wepy.boundary_conditions.receptor import UnbindingBC
from wepy_lysozyme_test._tasks import *
contig = get_gexp_span_contig('No', 0)
layout_graph = contig.resampling_tree_layout_graph(
bc_class=UnbindingBC,
progress_key='min_distances'
)
#+end_src
** <2019-12-06 Fri>
#+begin_src python
from wepy_lysozyme_test._tasks import *
save_warp_lineages_dcd('No', 0)
#+end_src
** <2019-12-09 Mon>
#+begin_src python
from wepy_lysozyme_test._tasks import *
jobs_df = get_jobs_df()
spans_df = copy(jobs_df).drop(columns=['success', 'status', 'jobid',])
spans_df = spans_df.groupby(['gexp', 'span_id']).count().reset_index().rename(
columns={'segment_idx' : 'n_segments'})
contig0 = get_gexp_span_contig('No', 0)
contig1 = get_gexp_span_contig('No', 1)
#+end_src
** <2019-12-10 Tue>
Rate plots
#+begin_src python
from wepy_lysozyme_test._tasks import *
gexp_plot_rates_rt('WExplore')
#+end_src
* Configurations
The place to put all configuration type data that will go into
parameterizing other productions and analyses.
* Workflow
A (mostly) ordered outline of how to proceed from beginning to end.
** HPCC Interactive Jobs
#+begin_src bash
srun -N 1 -c 1 --mem=15G --time=48:00:00 -C intel16 --gres=gpu:1 --pty "/bin/bash"
#+end_src
** Standalone Simulations
To validate that our system is okay we do a couple of standalone runs
on the system.
*** Script
Here is a script for running a basic kind of simulation:
#+begin_src python :tangle scripts/run.py
import simtk.openmm as omm
import simtk.openmm.app as omma
import simtk.unit as unit
import mdtraj as mdj
from openmmtools import testsystems
# from seh_prep.modules import StateDataFrameOMMReporter
STEP_TIME = 0.002 * unit.picoseconds
LANGEVIN_DEFAULTS = (
300.0*unit.kelvin,
1/unit.picosecond,
STEP_TIME
)
PLATFORM = 'CPU'
FRAME_INTERVAL_TIME = 10 * unit.picosecond
FRAME_INTERVAL = int(round(FRAME_INTERVAL_TIME / STEP_TIME))
def gen_sim(platform):
integrator = omm.LangevinIntegrator(*LANGEVIN_DEFAULTS)
test_sys = testsystems.LysozymeImplicit()
# write a PDB file for reference
top = mdj.Topology.from_openmm(test_sys.topology)
traj = mdj.Trajectory(test_sys.positions.value_in_unit(unit.nanometer), topology=top)
traj.save_pdb('ref.pdb')
if platform == 'CUDA':
platform_kwargs = {'Precision' : 'single'}
elif platform == 'OpenCL':
platform_kwargs = {"Precision" : 'single'}
elif platform == 'CPU':
platform_kwargs = {}
elif platform == 'Reference':
platform_kwargs = {}
else:
raise ValueError
platform = omm.Platform.getPlatformByName(platform)
simulation = omma.Simulation(test_sys.topology, test_sys.system, integrator,
platform=platform,
platformProperties=platform_kwargs)
simulation.context.setPositions(test_sys.positions)
# add some reporters
simulation.reporters.append(omma.DCDReporter('traj.dcd',
reportInterval=FRAME_INTERVAL,
enforcePeriodicBox=False))
# simulation.reporters.append(StateDataFrameOMMReporter(report_interval=10))
simulation.reporters.append(omma.StateDataReporter('state.csv',
FRAME_INTERVAL,
step=True,
time=True,
potentialEnergy=True,
kineticEnergy=True,
totalEnergy=True,
temperature=True,
volume=True,
))
return simulation
if __name__ == "__main__":
import sys
import time
# time in sampling to run
sim_time = float(sys.argv[1]) #* unit.second
platform = sys.argv[2]
print("Running simulation for {}".format(sim_time))
tranche_time = 10 * unit.picosecond
tranche_steps = int(round(tranche_time / STEP_TIME))
print(f"running tranches of {tranche_steps} steps")
simulation = gen_sim(platform)
print("Starting the simulation")
curr_time = 0.
step_counter = 0
start_time = time.time()
while curr_time < sim_time:
print("starting from step {}".format(step_counter))
print(type(tranche_steps))
simulation.step(tranche_steps)
curr_time = time.time() - start_time
step_counter += tranche_steps
print("Ran {}".format(step_counter))
print("Time currently is: {}".format(curr_time))
# add up the sampling time
sampling_time = step_counter * STEP_TIME
print(f"Ran a total of {step_counter} steps")
print(f"Ran a total of {sampling_time}")
#+end_src
*** run settings
#+BEGIN_SRC toml :tangle hpcc/standalone/run_settings.toml
# test run value
walltime = "24:00:00"
memory = "20gb"
num_cpus = 2
# test run value
num_gpus = 1
constraint = "[intel18|intel16]"
#+END_SRC
*** context settings
#+BEGIN_SRC toml :tangle hpcc/standalone/context_settings.toml
[context]
# its okay to leave out the exec dir, but you can specify this and the
#run will get executed here and then be moved back
# exec_dir = "$SCRATCH/exec"
# epilog is something that can be run after the run, if it is empty
# here that is okay, this is really only important for moving data
# back after being executed somewhere else
epilog = ""
# this section is for the setup of the run in the script
[context.setup]
# this is the parent dir for the jobs, there should be a directory
# here called 'jobs' and each job will will get it's own unique
# directory
goal_dir_path = "$SCRATCH/tree/lab/projects/wepy.lysozyme_test/hpcc/standalone"
# will load these modules from lmod i.e. 'module load '
lmod_modules = ["CUDA/10.0.130"]
# everything in here is translated directly into environment
# variables, e.g. "export ANACONDA_DIR=path/to/anaconda"
[context.setup.env_vars]
ANACONDA_DIR = "/mnt/home/lotzsamu/anaconda3"
#+END_SRC
*** Tasks
**** 1
#+begin_src bash :shebang "#!/bin/bash --login" :tangle hpcc/standalone/tasks/initial.sh
source $HOME/.bashrc
# use the ANACONDA_DIR which is set in the environment variables in
# the setup to setup this shell for anaconda
. ${ANACONDA_DIR}/etc/profile.d/conda.sh
env="wepy.lysozyme_test.sims"
# activate the proper virtualenv
conda activate $env
# this will run for under 24 hours
script="$SCRATCH/tree/lab/projects/wepy.lysozyme_test/scripts/run.py"
${ANACONDA_DIR}/envs/$env/bin/python $script 84000 CUDA
#+end_src
*** Retrieving results
#+begin_src bash
rsync -ravvhhiz \
lotzsamu@rsync.hpcc.msu.edu:/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test/hpcc/standalone/jobs/ \
$HOME/tree/lab/projects/wepy.lysozyme_test/hpcc/standalone/jobs
#+end_src
** Test Wepy Simulations
Run very short local simulations for debugging.
*** Run a very short simulation to get some data
#+begin_src bash
./scripts/noresampler_local_test.sh
#+end_src
**** Script
#+begin_src bash :shebang "#!/bin/bash" :tangle scripts/noresampler_local_test.sh
# use the ANACONDA_DIR which is set in the environment variables in
# the setup to setup this shell for anaconda
. ${ANACONDA_DIR}/etc/profile.d/conda.sh
env='wepy.lysozyme_test.sims_dev'
home_root="$HOME"
# activate the proper virtualenv
conda activate $env
# test our openmm installation
${ANACONDA_DIR}/envs/$env/bin/python -m simtk.testInstallation
# get into the right dir and copy the inputs etc.
rm -rf tmp/test_wepy_sim
mkdir -p tmp/test_wepy_sim
cd tmp/test_wepy_sim
# run the simulation
N_CYCLES=10
N_STEPS=10
N_WALKERS=5
N_WORKERS=1
PLATFORM="CPU"
RESAMPLER="REVOResampler"
script="$home_root/tree/lab/projects/wepy.lysozyme_test/scripts/wepy_run_by_cycles.py"
${ANACONDA_DIR}/envs/$env/bin/python $script \
$N_CYCLES \
$N_STEPS \
$N_WALKERS \
$N_WORKERS \
$PLATFORM \
$RESAMPLER
#+end_src
** Wepy Simulations
To validate that our system is okay we do a couple of standalone runs
on the system.
*** By Time
Here is a script for running a basic kind of simulation:
#+begin_src python :tangle scripts/wepy_run_by_time.py
if __name__ == "__main__":
import sys
import logging
from multiprocessing_logging import install_mp_handler
from wepy_tools.sim_makers.openmm.lysozyme import LysozymeImplicitOpenMMSimMaker
logging.getLogger().setLevel(logging.DEBUG)
install_mp_handler()
if sys.argv[1] == "-h" or sys.argv[1] == "--help":
print("arguments: n_cycles, n_steps, n_walkers, n_workers, platform, resampler")
else:
walltime = int(sys.argv[1])
#n_cycles = int(sys.argv[1])
n_steps = int(sys.argv[2])
n_walkers = int(sys.argv[3])
n_workers = int(sys.argv[4])
platform = sys.argv[5]
resampler = sys.argv[6]
print("Number of steps: {}".format(n_steps))
print("Time to run: {}".format(walltime))
# print("Number of cycles: {}".format(n_cycles))
sim_maker = LysozymeImplicitOpenMMSimMaker()
apparatus = sim_maker.make_apparatus(
integrator='LangevinIntegrator',
resampler=resampler,
bc='UnbindingBC',
platform=platform,
)
config = sim_maker.make_configuration(apparatus,
work_mapper='TaskMapper',
platform=platform)
sim_manager = sim_maker.make_sim_manager(n_walkers, apparatus, config)
sim_manager.run_simulation_by_time(walltime, n_steps, num_workers=n_workers)
#+end_src
*** By Num Cycles
Here is a script for running a basic kind of simulation:
#+begin_src python :tangle scripts/wepy_run_by_cycles.py
if __name__ == "__main__":
import sys
import logging
from multiprocessing_logging import install_mp_handler
from wepy_tools.sim_makers.openmm.lysozyme import LysozymeImplicitOpenMMSimMaker
logging.getLogger().setLevel(logging.DEBUG)
install_mp_handler()
if sys.argv[1] == "-h" or sys.argv[1] == "--help":
print("arguments: n_cycles, n_steps, n_walkers, n_workers, platform, resampler")
else:
n_cycles = int(sys.argv[1])
n_steps = int(sys.argv[2])
n_walkers = int(sys.argv[3])
n_workers = int(sys.argv[4])
platform = sys.argv[5]
resampler = sys.argv[6]
print("Number of steps: {}".format(n_steps))
print("Number of cycles: {}".format(n_cycles))
sim_maker = LysozymeImplicitOpenMMSimMaker()
apparatus = sim_maker.make_apparatus(
integrator='LangevinIntegrator',
resampler=resampler,
bc='UnbindingBC',
platform=platform,
)
config = sim_maker.make_configuration(apparatus,
work_mapper='TaskMapper',
platform=platform)
sim_manager = sim_maker.make_sim_manager(n_walkers, apparatus, config)
sim_manager.run_simulation_by_time(n_cycles, n_steps, num_workers=n_workers)
#+end_src
*** run settings
**** Full GPU Node, Week
#+BEGIN_SRC toml :tangle hpcc/wepy/configs/GPU-8_168-hrs.run_settings.toml
# test run value
walltime = "168:00:00"
memory = "20gb"
num_cpus = 4
num_gpus = 8
constraint = "[intel18|intel16]"
#+END_SRC
**** Single GPU Node, Week
#+BEGIN_SRC toml :tangle hpcc/wepy/configs/GPU-1_168-hrs.run_settings.toml
# test run value
walltime = "168:00:00"
memory = "10gb"
num_cpus = 4
num_gpus = 1
constraint = "[intel18|intel16]"
#+END_SRC
*** context settings
#+BEGIN_SRC toml :tangle hpcc/wepy/configs/context_settings.toml
[context]
# its okay to leave out the exec dir, but you can specify this and the
#run will get executed here and then be moved back
# exec_dir = "$SCRATCH/exec"
# epilog is something that can be run after the run, if it is empty
# here that is okay, this is really only important for moving data
# back after being executed somewhere else
epilog = ""
# this section is for the setup of the run in the script
[context.setup]
# this is the parent dir for the jobs, there should be a directory
# here called 'jobs' and each job will will get it's own unique
# directory
goal_dir_path = "$SCRATCH/tree/lab/projects/wepy.lysozyme_test/hpcc/wepy"
# will load these modules from lmod i.e. 'module load '
lmod_modules = ["CUDA/10.0.130"]
# everything in here is translated directly into environment
# variables, e.g. "export ANACONDA_DIR=path/to/anaconda"
[context.setup.env_vars]
ANACONDA_DIR = "/mnt/home/lotzsamu/anaconda3"
#+END_SRC
*** Tasks
**** NoResampler single-walker
#+begin_src bash :shebang "#!/bin/bash" :tangle hpcc/wepy/tasks/noresampler_single.sh
# use the ANACONDA_DIR which is set in the environment variables in
# the setup to setup this shell for anaconda
. ${ANACONDA_DIR}/etc/profile.d/conda.sh
env="wepy.lysozyme_test.sims"
# activate the proper virtualenv
conda activate $env
${ANACONDA_DIR}/envs/$env/bin/python -m simtk.testInstallation
WALLTIME=590400
N_STEPS=10000 # 20 picoseconds
N_WALKERS=1
N_WORKERS=1
PLATFORM="CUDA"
RESAMPLER="NoResampler"
script="$SCRATCH/tree/lab/projects/wepy.lysozyme_test/scripts/wepy_run_by_time.py"
${ANACONDA_DIR}/envs/$env/bin/python $script \
$WALLTIME \
$N_STEPS \
$N_WALKERS \
$N_WORKERS \
$PLATFORM \
$RESAMPLER
#+end_src
**** NoResampler ensemble
#+begin_src bash :shebang "#!/bin/bash" :tangle hpcc/wepy/tasks/noresampler_ensemble.sh
# use the ANACONDA_DIR which is set in the environment variables in
# the setup to setup this shell for anaconda
. ${ANACONDA_DIR}/etc/profile.d/conda.sh
env="wepy.lysozyme_test.sims"
# activate the proper virtualenv
conda activate $env
${ANACONDA_DIR}/envs/$env/bin/python -m simtk.testInstallation
WALLTIME=590400
N_STEPS=10000 # 20 picoseconds
N_WALKERS=48
N_WORKERS=8
PLATFORM="CUDA"
RESAMPLER="NoResampler"
script="$SCRATCH/tree/lab/projects/wepy.lysozyme_test/scripts/wepy_run_by_time.py"
${ANACONDA_DIR}/envs/$env/bin/python $script \
$WALLTIME \
$N_STEPS \
$N_WALKERS \
$N_WORKERS \
$PLATFORM \
$RESAMPLER
#+end_src
**** WExplore
#+begin_src bash :shebang "#!/bin/bash" :tangle hpcc/wepy/tasks/wexplore.sh
# use the ANACONDA_DIR which is set in the environment variables in
# the setup to setup this shell for anaconda
. ${ANACONDA_DIR}/etc/profile.d/conda.sh
env="wepy.lysozyme_test.sims"
# activate the proper virtualenv
conda activate $env
${ANACONDA_DIR}/envs/$env/bin/python -m simtk.testInstallation
WALLTIME=590400
N_STEPS=10000 # 20 picoseconds
N_WALKERS=48
N_WORKERS=8
PLATFORM="CUDA"
RESAMPLER="WExploreResampler"
script="$SCRATCH/tree/lab/projects/wepy.lysozyme_test/scripts/wepy_run_by_time.py"
${ANACONDA_DIR}/envs/$env/bin/python $script \
$WALLTIME \
$N_STEPS \
$N_WALKERS \
$N_WORKERS \
$PLATFORM \
$RESAMPLER
#+end_src
**** REVO
#+begin_src bash :shebang "#!/bin/bash" :tangle hpcc/wepy/tasks/revo.sh
# use the ANACONDA_DIR which is set in the environment variables in
# the setup to setup this shell for anaconda
. ${ANACONDA_DIR}/etc/profile.d/conda.sh
env="wepy.lysozyme_test.sims"
# activate the proper virtualenv
conda activate $env
${ANACONDA_DIR}/envs/$env/bin/python -m simtk.testInstallation
WALLTIME=590400
N_STEPS=10000 # 20 picoseconds
N_WALKERS=48
N_WORKERS=8
PLATFORM="CUDA"
RESAMPLER="REVOResampler"
script="$SCRATCH/tree/lab/projects/wepy.lysozyme_test/scripts/wepy_run_by_time.py"
${ANACONDA_DIR}/envs/$env/bin/python $script \
$WALLTIME \
$N_STEPS \
$N_WALKERS \
$N_WORKERS \
$PLATFORM \
$RESAMPLER
#+end_src
*** Running the simulations
Make sure to push the scripts to HPCC:
#+begin_src bash
inv custom.push-scripts
#+end_src
Generate the scripts on HPCC:
#+begin_src bash
inv custom.hpcc-gen-wepy-submissions
#+end_src
Then submit them into slurm:
#+begin_src bash
(cd hpcc/wepy && sbatch submissions/*)
#+end_src
*** Post-Simulation
Then retrieve the results from HPCC.
#+begin_src bash
rsync -ravvhhiz \
lotzsamu@rsync.hpcc.msu.edu:/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test/hpcc/wepy/jobs/ \
$HOME/tree/lab/projects/wepy.lysozyme_test/hpcc/wepy/jobs
#+end_src
Pop the links out to resources should be done on HPCC first so they
are both in sync, but this shows it being done locally.
#+begin_src bash
symlink-pop /home/salotz/tree/lab/projects/wepy.lysozyme_test/hpcc/wepy/jobs \
/home/salotz/tree/lab/resources/project-resources/wepy.lysozyme_test/hpcc/wepy/jobs
#+end_src
Then back them up to volta
Just doing the whole tree since that needs to be done anyways.
#+begin_src bash
inv refugue.rsync --source superior --target volta
#+end_src
Backup to allegheny
#+begin_src bash
inv refugue.rsync --source superior --target allegheny
#+end_src
** Preprocessing
*** Link Results by Experimental groups
#+begin_src bash
python -m wepy_lysozyme_test.execution.link_results
#+end_src
** Analyze wepy simulations
*** resampling tree
#+begin_src bash
python -m wepy_lysozyme_test.execution.gen_restrees 'WExplore'
#+end_src
#+begin_src bash
python -m wepy_lysozyme_test.execution.gen_restrees 'REVO'
#+end_src
*** terminal lineage trajectories
**** Warping Lineages
To get them all in one trajectory file:
#+begin_src bash
(for gexp in 'No' 'No_single' 'REVO' 'WExplore'; do
python -m wepy_lysozyme_test.execution.gen_warp_lineages $gexp || exit 1
done)
#+end_src
*** Contig Stats: length, warping, rates
#+begin_src bash
python -m wepy_lysozyme_test.execution.gen_contig_stats_table
#+end_src
*** Rate Plots
#+begin_src bash
(for gexp in 'No' 'No_single' 'REVO' 'WExplore'; do
python -m wepy_lysozyme_test.execution.show_rate_plots $gexp || exit 1
done)
#+end_src
*** TODO compute observables
*** TODO free energy plots
* Analysis
A structured approach to running analytics tasks in python.
In the "Tasks" section you write pure functions to do stuff.
In the workflows you can build up workflows of the tasks in Prefect.
In the execution you write the "scripts" that actually run them.
** Tasks
Define the units of work that you want done here.
*** Initialization
**** Header
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
"""Generated file from the analysis.org file. Do not edit directly."""
#+END_SRC
**** Imports
Imported modules that will be available to all tasks
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
# standard library
import os
import os.path as osp
import pickle
from copy import copy, deepcopy
import itertools as it
from collections import defaultdict
from pathlib import Path
from operator import attrgetter
# de facto standard library
import numpy as np
import pandas as pd
import sqlalchemy as sqla
import matplotlib.pyplot as plt
import networkx as nx
from tabulate import tabulate
# extra non-domain specific
import joblib
import pint
import simtk.unit as tkunit
#+END_SRC
**** Configuration
#+begin_src python :tangle src/wepy_lysozyme_test/_tasks.py
PROJECT_PATH = "$HOME/tree/lab/projects/wepy.lysozyme_test"
#+end_src
**** Paths
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
## Paths
# for localizing paths to very commonly used resources and resrouces
# which may change schema. The directory structure for the rest is the
# schema, so just use osp.join(project_path(), 'subpath/to/resource')
# for the rest so a lot of work is reduced in specifying all of them
def project_path():
return Path(osp.expanduser(osp.expandvars(PROJECT_PATH)))
def data_path():
return project_path() / 'data'
def db_path():
return project_path() / 'db'
def media_path():
return project_path() / 'media'
def scratch_path():
return project_path() / 'scratch'
def scripts_path():
return project_path() / 'scripts'
def src_path():
return project_path() / 'src'
def tmp_path():
return project_path() / 'tmp'
def troubleshoot_path():
return project_path() / 'troubleshoot'
# specific things
def sqlite_path():
return project_path() / 'db/db.sqlite'
def joblib_cache_path():
return project_path() / 'cache/joblib'
#+END_SRC
**** Setup
***** Joblib Caching
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
## Setup
# set up the joblib cache
jlmem = joblib.Memory(joblib_cache_path())
#+END_SRC
***** Recursion limit
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
RECURSION_LIMIT = 1000000
# set this when you want to do some recursion stuff with contigtrees
def _set_recursion_limit(recursion_limit=RECURSION_LIMIT):
import sys; sys.setrecursionlimit(recursion_limit)
print("Setting recursion limit to {}".format(recursion_limit))
# set the recursion depth since it is always needing to be increased
_set_recursion_limit()
#+END_SRC
***** Units: pint
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
UNITS = pint.UnitRegistry()
#+END_SRC
***** COMMENT SQlite
**** Data: Read & Write
These functions infer the type of file you want to write based on the
file extension.
#+begin_src python :tangle src/wepy_lysozyme_test/_tasks.py
def init_dir(dir_path):
dir_path.mkdir(exist_ok=True, parents=True)
def load_obj(filepath):
fname = osp.basename(filepath)
# use the file extension for how to load it
if fname.endswith('jl.pkl'):
# it is a joblib object so use joblib to load it
with open(filepath, 'rb') as rf:
obj = joblib.load(rf)
elif fname.endswith('pkl'):
# it is a pickle object so use joblib to load it
with open(filepath, 'rb') as rf:
obj = pickle.load(rf)
return obj
def save_obj(obj_path, obj, overwrite=False, ext='jl.pkl'):
import os
import os.path as osp
import pickle
import joblib
if ext == 'jl.pkl':
pickler_dump = joblib.dump
elif ext == 'pkl':
pickler_dump = pickle.dump
else:
raise ValueError("Must choose an extension for format selection")
# if we are not overwriting check if it exists
if not overwrite:
if osp.exists(obj_path):
raise OSError("File exists ({}), not overwriting".format(obj_path))
# otherwise make sure the directory exists
os.makedirs(osp.dirname(obj_path), exist_ok=True)
# it is a joblib object so use joblib to load it
with open(obj_path, 'wb') as wf:
pickler_dump(obj, wf)
def load_table(filepath):
import os.path as osp
import pandas as pd
fname = osp.basename(filepath)
# use the file extension for how to load it
if fname.endswith('csv'):
df = pd.read_csv(filepath, index_col=0)
elif fname.endswith('pkl'):
df = pd.read_pickle(filepath)
else:
raise ValueError("extension not supported")
return df
def save_table(table_path, df, overwrite=False, ext='csv'):
import os
import os.path as osp
import pickle
import pandas as pd
# if we are not overwriting check if it exists
if not overwrite:
if osp.exists(table_path):
raise OSError("File exists ({}), not overwriting".format(table_path))
# otherwise make sure the directory exists for this observable
os.makedirs(osp.dirname(table_path), exist_ok=True)
if ext == 'csv':
df.to_csv(table_path)
elif ext == 'pkl':
df.to_pickle(table_path)
else:
raise ValueError("extension not supported")
#+end_src
*** Parameters
The parameters and constants for the pipeline.
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
## Parameters
# number of individual steps per cycle in simulations
NUM_CYCLE_STEPS = 10000
STEP_TIME = 2 * tkunit.femtosecond
CYCLE_TIME = NUM_CYCLE_STEPS * STEP_TIME
# we could go in and get these values from the sim maker itself, but
# we'll keep it simple since this is the answer
WE_PMIN = 1e-12
## Visualization
# Resampling trees
PROGRESS_KEY = 'min_distances'
MAX_PROGRESS_NORM = 1.0
DISCONTINUOUS_NODE_SHAPE = 'square'
DEFAULT_NODE_SHAPE = 'disc'
COLORMAP_NAME = 'plasma'
#+END_SRC
**** Experimental Constants
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
EXPERIMENTAL_MFPT_UNIT = tkunit.minute
# TODO: in the paper the unit is written "x 10^{-4} s^{-1}"
EXPERIMENTAL_RATE_UNIT = None
EXPERIMENTAL_RATE = 950 * (1/tkunit.second)
EXPERIMENTAL_MFPT = 1 / EXPERIMENTAL_RATE
EXPERIMENTAL_RT = EXPERIMENTAL_MFPT
#+END_SRC
*** Data
**** Reference System
#+begin_src python :tangle src/wepy_lysozyme_test/_tasks.py
def get_sim_maker():
from wepy_tools.sim_makers.openmm.lysozyme import LysozymeImplicitOpenMMSimMaker
return LysozymeImplicitOpenMMSimMaker()
def get_ref_state():
return get_sim_maker().init_state
def get_distance_func():
return get_sim_maker().distance
def get_ref_image():
maker = get_sim_maker()
# apply the distance metric image function to the reference state
return maker.distance.image(maker.init_state)
def get_topology():
return get_sim_maker().json_top()
#+end_src
**** Selection Idxs and Tops
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
# just for reference all the keys we want to fill in
SELECTION_KEYS = ('all_atoms/ligand',
'all_atoms/protein',
'all_atoms/binding_site',
'all_atoms/image',
'image/ligand',
'image/binding_site',
'image/protein',
)
TOP_KEYS = ('all_atoms', 'image', 'ligand')
@jlmem.cache
def get_selection_idxs_tops():
""" compute both at once for convenience """
import numpy as np
import mdtraj as mdj
from wepy.util.json_top import json_top_subset
from wepy.util.mdtraj import (
mdtraj_to_json_topology,
json_to_mdtraj_topology,
traj_fields_to_mdtraj
)
# fill these in
sel_tops = {}
sel_idxs = {}
### Initialize
sim_maker = get_sim_maker()
BC = 'UnbindingBC'
BC_PARAMS = sim_maker.DEFAULT_BC_PARAMS[BC]
ligand_idxs = list(sim_maker.ligand_idxs())
protein_idxs = list(sim_maker.receptor_idxs())
binding_site_idxs = list(sim_maker.binding_site_idxs(BC_PARAMS['cutoff_distance']))
image_idxs = list(sim_maker.distance._image_idxs)
# the initial state
init_state = get_ref_state()
init_box_vectors = init_state['box_vectors'] # * init_state.box_vectors_unit
# the full topology
all_atoms_top = get_topology()
### All atoms
sel_tops['all_atoms'] = all_atoms_top
# the all atoms subselections
sel_idxs['all_atoms/ligand'] = ligand_idxs
sel_idxs['all_atoms/protein'] = protein_idxs
sel_idxs['all_atoms/binding_site'] = binding_site_idxs
### Image
# get the topology for this
image_top = json_top_subset(all_atoms_top, image_idxs)
sel_tops['image'] = image_top
sel_idxs['all_atoms/image'] = image_idxs
sel_idxs['image/ligand'] = [image_idxs.index(i)
for i in ligand_idxs
if i in image_idxs]
sel_idxs['image/protein'] = [image_idxs.index(i)
for i in protein_idxs
if i in image_idxs]
sel_idxs['image/binding_site'] = sel_idxs['image/protein']
### Ligand
ligand_top = json_top_subset(all_atoms_top, sel_idxs['all_atoms/ligand'])
sel_tops['ligand'] = ligand_top
return sel_idxs, sel_tops
def get_selection_idxs():
sels, tops = get_selection_idxs_tops()
return sels
def get_selection_tops():
sels, tops = get_selection_idxs_tops()
return tops
# make a dataframe summarizing the counts of different selections for
# each ligand
def selection_df():
from collections import defaultdict
selection_df = defaultdict(list)
selections = get_selection_idxs()
for selection_key, idxs in selections.items():
selection_df[selection_key].append(len(idxs))
selection_df = pd.DataFrame(selection_df)
return selection_df
#+END_SRC
**** Atom Selection Trajs
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
def get_ref_state_traj_fields(rep_key='main_rep'):
import numpy as np
selection_idxs = get_selection_idxs()
tops = get_selection_tops()
ref_state = get_ref_state()
ref_state_traj_fields = {}
if rep_key is None or rep_key == 'all_atoms':
positions = ref_state['positions']
else:
positions = ref_state['positions'][
selection_idxs['all_atoms/{}'.format(rep_key)]]
ref_state_traj_fields['positions'] = np.array([positions])
ref_state_traj_fields['box_vectors'] = np.array([ref_state['box_vectors']])
return ref_state_traj_fields
@jlmem.cache
def ref_selection_trajs():
import numpy as np
from wepy.util.mdtraj import traj_fields_to_mdtraj
selection_idxs = get_selection_idxs()
tops = get_selection_tops()
ref_state = get_ref_state()
ref_state_fields = {key : np.array([value]) for key, value in ref_state.dict().items()}
# make fields for the different representations
ref_state_fields['image'] = ref_state['positions'][selection_idxs['all_atoms/image']]
ref_state_fields['ligand'] = ref_state['positions'][selection_idxs['all_atoms/ligand']]
# all atoms
all_atoms_traj = traj_fields_to_mdtraj(ref_state_fields, tops['all_atoms'],
rep_key='positions')
# image
image_traj = traj_fields_to_mdtraj(ref_state_fields, tops['image'],
rep_key='image')
# ligand
ligand_traj = traj_fields_to_mdtraj(ref_state_fields, tops['ligand'],
rep_key='ligand')
ref_selections = {}
ref_selections['all_atoms'] = all_atoms_traj
ref_selections['image'] = image_traj
ref_selections['ligand'] = ligand_traj
return ref_selections
#+END_SRC
**** Experimental Groups and Jobs
- gexp :: short for "experimental group", and perhaps a pun on
"experimental-group expression" since we organize each
experimental group by a singular structured name
The experimental conditions for each simulation are specified here
with reference to the job id for each of them.
#+begin_src python :tangle src/wepy_lysozyme_test/_tasks.py
EXPERIMENTAL_GROUPS = ('No', 'No_single', 'REVO', 'WExplore',)
# the parameters used for each test group
from wepy.resampling.resamplers.resampler import NoResampler
from wepy.resampling.resamplers.revo import REVOResampler
from wepy.resampling.resamplers.wexplore import WExploreResampler
from wepy.resampling.decisions.decision import NoDecision
from wepy.resampling.decisions.clone_merge import MultiCloneMergeDecision
GEXP_METADATA = {
'No' : {
'resampler' : NoResampler,
'decision' : NoDecision,
'num_walkers' : 48,
},
'No_single' : {
'resampler' : NoResampler,
'decision' : NoDecision,
'num_walkers' : 1,
},
'REVO' : {
'resampler' : REVOResampler,
'decision' : MultiCloneMergeDecision,
'num_walkers' : 48,
},
'WExplore' : {
'resampler' : WExploreResampler,
'decision' : MultiCloneMergeDecision,
'num_walkers' : 48,
},
}
def get_jobs_df():
jobs_path = data_path() / 'sim_management/jobs.csv'
jobs_df = pd.read_csv(jobs_path)
return jobs_df
def get_gexp_jobs():
jobs_df = get_jobs_df()
gexp_jobs = {}
for gexp, grp_df in jobs_df.groupby('gexp'):
gexp_jobs[gexp] = list(grp_df['jobid'].values)
return gexp_jobs
def get_gexp_paths():
"""Get a dictionary of all of the paths for the jobs dirs for each
experimental group."""
gexp_jobs = get_gexp_jobs()
sims_dir = data_path() / 'sims/wepy/jobs'
gexp_paths = defaultdict(list)
for grp_name, jobids in gexp_jobs.items():
for job_id in jobids:
path = sims_dir / str(job_id)
gexp_paths[grp_name].append(path)
return gexp_paths
#+end_src
**** WepyHDF5s
Raw access to WepyHDF5s for each experimental group and the paths to
the sample files.
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
def get_job_hdf5_path(jobid):
return data_path() / f'sims/wepy/jobs/{jobid}/root.wepy.h5'
def get_linker_hdf5_path(gexp):
return data_path() / f'results/{gexp}.wepy.h5'
def link_gexp_hdf5s(gexp):
from wepy.hdf5 import WepyHDF5
jobids = get_gexp_jobs()[gexp]
h5_paths = [get_job_hdf5_path(jobid) for jobid in jobids]
# get a template wepy_hdf5 file
template_h5 = WepyHDF5(h5_paths[0], mode='r')
# make the linker file
linker_h5_path = get_linker_hdf5_path(gexp)
with template_h5:
linker_h5 = template_h5.clone(linker_h5_path, mode='w')
# then link all the job files to it
with linker_h5:
job_run_idxs = {}
for jobid, h5_path in zip(jobids, h5_paths):
# link the run from the file
new_run_idx = linker_h5.link_run(h5_path, 0)
job_run_idxs[jobid] = new_run_idx
# set the jobid as a run attr
linker_h5.run(new_run_idx).attrs['jobid'] = jobid
return linker_h5_path
def get_gexp_wepy_h5(gexp):
from wepy.hdf5 import WepyHDF5
path = get_linker_hdf5_path(gexp)
return WepyHDF5(path, mode='r')
def get_all_wepy_h5s():
d = {}
for gexp in EXPERIMENTAL_GROUPS:
d[gexp] = get_gexp_wepy_h5(gexp)
return d
def get_gexp_jobs_to_runs_map(gexp):
"""Get the runs in the HDF5 for the gexp that are mapped to by the
jobids."""
wepy_h5 = get_gexp_wepy_h5(gexp)
jobs_runs_d = {}
with wepy_h5:
for run_idx in wepy_h5.run_idxs:
run_grp = wepy_h5.run(run_idx)
jobid = run_grp.attrs['jobid']
run_idx = run_grp.attrs['run_idx']
jobs_runs_d[jobid] = run_idx
return jobs_runs_d
def get_gexp_span_ids(gexp):
jobs_df = get_jobs_df()
gexp_jobs_df = jobs_df[jobs_df['gexp'] == gexp]
span_ids = sorted(list(set(gexp_jobs_df['span_id'].values)))
return span_ids
def get_gexp_span_ids_run_idxs(gexp):
"""Get a mapping of the span_ids (names given to them) to the run idxs
(in order of the segments in the span via 'segment_idxs').
The span_ids and segment_idxs come from the jobs_df table.
"""
jobs_df = get_jobs_df()
gexp_jobs_df = jobs_df[jobs_df['gexp'] == gexp]
# get the mapping of jobids -> h5 run idxs
jobs_runs_d = get_gexp_jobs_to_runs_map(gexp)
# then go through each span and get the run idxs for it
spans_runs = {}
for span_id, span_df in gexp_jobs_df.groupby('span_id'):
# make records for each segment so that we can sort them
span_segs = []
for idx, row in span_df.iterrows():
seg_rec = (row['segment_idx'], row['jobid'])
span_segs.append(seg_rec)
# sort them, dereference the run_idx
span_run_idxs = [jobs_runs_d[jobid]
for seg_idx, jobid in sorted(span_segs)]
spans_runs[span_id] = span_run_idxs
return spans_runs
#+END_SRC
**** Contig Trees
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
# the base contigtree is the one that can be cached not the one with
# the HDF5 file in it, so we cache this, you can use the other
# function to get the contigtree with the HDF5, but the openening of
# file handles is annoying and this might be the best way to do this
# anyhow
@jlmem.cache
def get_base_contigtree(gexp):
from wepy.analysis.contig_tree import BaseContigTree
from wepy.boundary_conditions.receptor import UnbindingBC
wepy_h5 = get_gexp_wepy_h5(gexp)
# get the decision class for this gexp
decision = GEXP_METADATA[gexp]['decision']
_set_recursion_limit()
base_contigtree = BaseContigTree(wepy_h5,
decision_class=decision,
boundary_condition_class=UnbindingBC)
return base_contigtree
def get_contigtree(gexp):
from wepy.analysis.contig_tree import ContigTree
wepy_h5 = get_gexp_wepy_h5(gexp)
base_contigtree = get_base_contigtree(gexp)
return ContigTree(wepy_h5, base_contigtree=base_contigtree)
def all_contigtrees():
d = {}
for gexp in EXPERIMENTAL_GROUPS:
d[gexp] = get_contigtree(gexp)
return d
@jlmem.cache
def get_gexp_span_ids_span_idxs_map(gexp):
"""Get a mapping of the span_ids to the span_idxs in the
contigtree."""
contigtree = get_base_contigtree(gexp)
# get the first run idx -> span_id map
run_idxs_to_span_ids = {run_idxs[0] : span_id
for span_id, run_idxs in get_gexp_span_ids_run_idxs(gexp).items()}
span_ids_to_span_idxs_d = {}
for span_idx, trace in contigtree.span_traces.items():
run_idx = trace[0][0]
span_id = run_idxs_to_span_ids[run_idx]
span_ids_to_span_idxs_d[span_id] = span_idx
return span_ids_to_span_idxs_d
## Contigs
def get_gexp_span_contig(gexp, span_id):
# get the span idx in the contigtree from the span_id in the jobs
# table
span_ids_to_span_idxs_d = get_gexp_span_ids_span_idxs_map(gexp)
span_idx = span_ids_to_span_idxs_d[span_id]
# get the contig for this span
contigtree = get_contigtree(gexp)
contig = contigtree.span_contig(span_idx)
return contig
#+END_SRC
**** Warping
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
@jlmem.cache
def get_gexp_span_warps(gexp, span_id):
contig = get_gexp_span_contig(gexp, span_id)
warp_df = contig.warping_records_dataframe()
return warp_df
#+END_SRC
**** Parent Table and Forest
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
@jlmem.cache
def get_gexp_span_parent_forest(gexp, span_id):
contig = get_gexp_span_contig(gexp, span_id)
parent_forest = ParentForest(contig=contig)
return parent_forest
@jlmem.cache
def get_gexp_span_parent_table(gexp, span_id, discontinuities=False):
contig = get_gexp_span_contig(gexp, span_id)
parent_table = contig.parent_table(discontinuities=discontinuities)
return parent_table
#+END_SRC
**** TODO Macro-State Networks
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
@jlmem.cache
def get_state_network(gexp, observable, lag_time):
# clear cache
from wepy.analysis.network import MacroStateNetwork
contigtree = get_contigtree(gexp)
cluster_key = "observables/{}".format(observable)
net = MacroStateNetwork(contigtree, transition_lag_time=lag_time,
assg_field_key=cluster_key)
# set the macrostate weights
net.set_macrostate_weights()
# set the index of the cluster centers for each macrostate, both
# the feature index (from the raw vector of features) and the
# trace-like index (run, traj, frame)
clf = get_clustering_model(observable, gexp)
# the feature vector index (the concatenated features)
net.set_nodes_attribute('center_feature_idx',
{node_idx : feature_idx for node_idx, feature_idx
in enumerate(clf.cluster_ids_)})
# get the run trace index of each cluster center
net.set_nodes_attribute('center_idx', {node_idx : trace_idx for node_idx, trace_idx
in enumerate(cluster_center_run_trace_idxs(observable, gexp))})
return net.base_network
#+END_SRC
*** Generating Files
**** Write Reference Topology PDBs
Write out topology PDBs for all the representations of each
ligand. These should be from the initial walkers so we can also use
them as reference states.
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
def write_ref_selection_pdbs():
import os
import os.path as osp
ref_selections = ref_selection_trajs()
top_dir = data_path() / 'top'
# make sure the directory exists
os.makedirs(top_dir,
exist_ok=True)
for sel_name, sel_ref_traj in ref_selections.items():
pdb_path = top_dir / "{}_ref.pdb".format(sel_name)
sel_ref_traj.save_pdb(pdb_path)
#+END_SRC
**** Write restree
***** GEXF
#+begin_src python :tangle src/wepy_lysozyme_test/_tasks.py
def save_restree_gexf(gexp, span_id):
restree_path = data_path() / f'restree/gexp-{gexp}/span-{span_id}.restree.gexf'
restree_path.parent.mkdir(exist_ok=True, parents=True)
layout_graph = get_restree_graph(gexp, span_id)
nx.write_gexf(layout_graph.viz_graph, restree_path)
#+end_src
**** Warping lineages
#+begin_src python :tangle src/wepy_lysozyme_test/_tasks.py
def save_warp_lineages_dcds(gexp, span_id):
lineages_dir_path = data_path() / f'warp-lineages/gexp-{gexp}/span-{span_id}_warp-lineages'
lineages_dir_path.mkdir(exist_ok=True, parents=True)
lineage_traces = get_warp_lineage_traces(gexp, span_id)
contig = get_gexp_span_contig(gexp, span_id)
total_frames = 0
with contig:
for warp_idx, lineage_trace in enumerate(lineage_traces):
total_frames += len(lineage_trace)
print(f"warp: {warp_idx} has {len(lineage_trace)} frames")
path = lineages_dir_path / f'warp-{warp_idx}.dcd'
print("loading ")
traj = contig.wepy_h5.traj_fields_to_mdtraj(
contig.wepy_h5.get_trace_fields(lineage_trace,
['positions', 'box_vectors']
))
print("saving")
traj.save_dcd(str(path))
print(f"total frames: {total_frames}")
def save_warp_lineages_dcd(gexp, span_id):
lineages_path = data_path() / f'warp-lineages/gexp-{gexp}/span-{span_id}_warp-lineages.dcd'
lineages_path.parent.mkdir(exist_ok=True, parents=True)
lineages_trace = it.chain(*get_warp_lineage_traces(gexp, span_id))
print(f"total frames: {len(lineage_trace)}")
contig = get_gexp_span_contig(gexp, span_id)
print("Loading")
with contig:
traj = contig.wepy_h5.traj_fields_to_mdtraj(
contig.wepy_h5.get_trace_fields(lineages_trace,
['positions', 'box_vectors']
))
print("saving DCD")
traj.save_dcd(str(lineages_path))
#+end_src
**** Span Stats
#+begin_src python :tangle src/wepy_lysozyme_test/_tasks.py
def save_span_stats_table(overwrite=True):
table_path = data_path() / f'span-stats/span-stats.csv'
spans_df = get_span_stats_df()
save_table(table_path, spans_df, overwrite=overwrite)
#+end_src
*** Utilities
**** TODO Stack Toplogies
Make a topology which has multiple identical chains in it.
**** GEXF
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
def clean_gephi_gexf(gexf):
from xmltodict import parse, unparse
# parse the bad gexf file
bad_gexf = parse(gexf)
# the good tag we take things from
good_gexf_tag = """
"""
# parse the good tag and get the values
tag_attrs = {}
for key, value in parse(good_gexf_tag)['gexf'].items():
if not key.startswith('@'):
continue
else:
tag_attrs[key] = value
# read the bad xml and parse it
# then replace the tag attributes for this with the good ones
for key, value in tag_attrs.items():
bad_gexf['gexf'][key] = value
# then get the good xml string
return unparse(bad_gexf)
#+END_SRC
*** Exploring Simulations
**** Resampling Trees
#+begin_src python :tangle src/wepy_lysozyme_test/_tasks.py
@jlmem.cache
def get_restree_graph(gexp, span_id,
progress_key=PROGRESS_KEY):
# clear cache
from wepy.boundary_conditions.receptor import UnbindingBC
contig = get_gexp_span_contig(gexp, span_id)
layout_graph = contig.resampling_tree_layout_graph(
bc_class=UnbindingBC,
progress_key=PROGRESS_KEY,
)
return layout_graph
#+end_src
**** Exit Point trajectories
#+begin_src python :tangle src/wepy_lysozyme_test/_tasks.py
#
@jlmem.cache
def get_warp_lineage_traces(gexp, span_id):
contig = get_gexp_span_contig(gexp, span_id)
with contig:
warp_trace = contig.warp_contig_trace()
lineage_traces = contig.lineages(warp_trace)
return lineage_traces
def get_warp_lineage_trajs(gexp, span_id):
lineage_traces = get_warp_lineage_traces(gexp, span_id)
with contig:
trajs = []
for warp_idx, lineage_trace in enumerate(lineage_traces):
traj = contig.wepy_h5.traj_fields_to_mdtraj(
contig.wepy_h5.get_trace_fields(lineage_trace,
['positions', 'box_vectors']
))
trajs.append(traj)
return traj
#+end_src
**** Span Stats
#+begin_src python :tangle src/wepy_lysozyme_test/_tasks.py
SPANS_TABLE_COLS = (
'gexp',
'span_id',
'n_runs',
'n_warps',
'n_cycles',
'total_sampling_time_ps', # picoseconds
'warps_per_ps', # picoseconds
'rate_ps', # picoseconds
'total_warped_weight',
'warp_weight_mean',
'warp_weight_median',
'warp_weight_min',
'warp_weight_max',
'warp_wait-time_mean_ps',
'warp_wait-time_median_ps',
'warp_wait-time_min_ps',
'warp_wait-time_max_ps',
)
@jlmem.cache
def get_span_stats_df():
from wepy.analysis.rates import contig_warp_rates
jobs_df = get_jobs_df()
spans_df = copy(jobs_df).drop(columns=['success', 'status', 'jobid',])
# count the number of segments in the jobs
spans_df = spans_df.groupby(['gexp', 'span_id']).count().reset_index().rename(
columns={'segment_idx' : 'n_runs'})
# get the number of cycles and warps in the span by reading the
# contig
new_cols = {
'n_warps' : [],
'n_cycles' : [],
'total_sampling_time_ps' : [],
'warps_per_ps' : [],
'rate_ps' : [],
'total_warped_weight' : [],
'warp_weight_mean' : [],
'warp_weight_median' : [],
'warp_weight_min' : [],
'warp_weight_max' : [],
'warp_wait-time_mean_ps' : [],
'warp_wait-time_median_ps' : [],
'warp_wait-time_min_ps' : [],
'warp_wait-time_max_ps' : [],
}
for row_idx, row in spans_df.iterrows():
print(row['gexp'], row['span_id'])
contig = get_gexp_span_contig(row['gexp'], row['span_id'])
with contig:
n_cycles = contig.num_cycles
# calculate rates and fluxes
total_weight, rate, total_sampling_time = contig_warp_rates(
contig,
CYCLE_TIME
)[0][0]
warps_df = contig.warping_records_dataframe()
n_warps = len(warps_df)
# the unweighted rate of warp events
warps_per_sampling = n_warps / total_sampling_time
total_warped_weight = warps_df['weight'].sum()
# descriptive statistics on the warps weights
mean_warp_weight = warps_df['weight'].mean()
median_warp_weight = warps_df['weight'].median()
min_warp_weight = warps_df['weight'].min()
max_warp_weight = warps_df['weight'].max()
wait_times_ps = warps_df['cycle_idx'] * CYCLE_TIME.in_units_of(tkunit.picosecond)
# descriptive statistics on the warps waiting times
mean_warp_wait_time = wait_times_ps.mean()
median_warp_wait_time = wait_times_ps.median()
min_warp_wait_time = wait_times_ps.min()
max_warp_wait_time = wait_times_ps.max()
new_cols['n_cycles'].append(n_cycles)
new_cols['rate_ps'].append(rate.value_in_unit((1/tkunit.picosecond).unit))
new_cols['total_sampling_time_ps'].append(total_sampling_time.value_in_unit(tkunit.picosecond))
new_cols['n_warps'].append(n_warps)
new_cols['total_warped_weight'].append(total_warped_weight)
new_cols['warps_per_ps'].append(warps_per_sampling.value_in_unit((1/tkunit.picosecond).unit))
# warp weights statistics
new_cols['warp_weight_mean'].append(mean_warp_weight)
new_cols['warp_weight_median'].append(median_warp_weight)
new_cols['warp_weight_min'].append(min_warp_weight)
new_cols['warp_weight_max'].append(max_warp_weight)
# warp wait time statistics
new_cols['warp_wait-time_mean_ps'].append(mean_warp_wait_time)
new_cols['warp_wait-time_median_ps'].append(median_warp_wait_time)
new_cols['warp_wait-time_min_ps'].append(min_warp_wait_time)
new_cols['warp_wait-time_max_ps'].append(max_warp_wait_time)
# add the columns
for colname, col in new_cols.items():
spans_df[colname] = col
return spans_df
def span_stats_table_str():
spans_df = get_span_stats_df()
table_str = tabulate(spans_df,
headers=spans_df.columns,
tablefmt='orgtbl'
)
return table_str
#+end_src
*** Rate Plots
The calculated overall rates for the simulations are in the [[*Span Stats][Span Stats]].
These are the plots of the rates and weights over time.
**** Plotting Functions
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
def plot_rates(run_target_weights_rates, cycle_time, experimental_rt,
target_idx=0, min_probability=0.e-12,
mfpt_unit=tkunit.second,
time_unit=tkunit.picosecond,
logscale=False):
import numpy as np
import scipy.stats
import scipy.stats.mstats
RUN_LABEL_TEMPLATE = "Run: {}"
# collate data
runs_weights = {}
runs_rates = {}
runs_rts = {}
runs_times = {}
for run_idx, target_weights_rates in run_target_weights_rates.items():
run_cum_weights = []
run_rates = []
run_times = []
for cycle_idx, timepoint in enumerate(target_weights_rates):
# if there were no records (and thus no weights and rates
# i.e. an empty dictionary) we set nans here so we can
# easily discard them later but still keep track of time
if not timepoint:
#weight, rate, time = np.nan, np.nan, cycle_time * cycle_idx
weight, rate, time = 0.0, 0.0 * 1/time_unit, cycle_time * cycle_idx
else:
weight, rate, time = timepoint[target_idx]
run_cum_weights.append(weight)
run_rates.append(rate)
run_times.append(time)
# calculate the residence times
run_rts = []
for rate in run_rates:
# we can't divide by zero with simtk units so we get the
# value and invert it with a numpy float64 then
# redimensionalize it
run_rt = (np.float64(1.0) / rate.value_in_unit(rate.unit)) * (1/rate.unit)
# if the rate is nan (which was 0.0 aggregated
# probability) we catch this and explicitly set the
# residence time to infinity
# if np.isnan(rate):
# run_rt = np.inf * time_unit
# # otherwise compute the mfpt/residence time
# else:
# run_rt = 1/rate
run_rts.append(run_rt)
runs_weights[run_idx] = run_cum_weights
runs_rates[run_idx] = run_rates
runs_rts[run_idx] = run_rts
runs_times[run_idx] = run_times
# compute the average unbound probability and MFPT across all runs
# aggregate all of the runs weights into a single array and fill
# with nans where missing, these will be ignored later
num_runs = len(runs_times)
# get the longest runs number of time points
max_num_timepoints = max([len(run_times) for run_times in runs_times.values()])
min_num_timepoints = min([len(run_times) for run_times in runs_times.values()])
# get an array of the actual times for the longest one
longest_run_idx = sorted([(len(run_times), run_idx)
for run_idx, run_times in runs_times.items()],
reverse=True)[0][1]
longest_run_times = runs_times[longest_run_idx]
# the shortest one
shortest_run_idx = sorted([(len(run_times), run_idx)
for run_idx, run_times in runs_times.items()],
reverse=False)[0][1]
shortest_run_times = runs_times[shortest_run_idx]
# make an array for all of them where the longest we have is the
# shortest one
all_weights = np.empty((num_runs, min_num_timepoints))
all_weights.fill(np.nan)
# add the weights vectors to the array
for i, run_weights in enumerate(runs_weights.values()):
all_weights[i, 0:] = run_weights[0:min_num_timepoints]
# compute the arithmetic mean, ignoring nans
mean_weights = np.mean(all_weights, axis=0)
# compute standard error of means
sem_weights = scipy.stats.sem(all_weights, axis=0, nan_policy='raise')
# get the bounds of the SEM envelope around the mean so we can
# transform them
upper_sem_bounds = mean_weights + sem_weights
lower_sem_bounds = mean_weights - sem_weights
# TODO compute harmonic mean as well of the rates
# TODO: Alex wants me to calculate this just as a transformation
# of the computed mean and std errors of the probability. Not sure
# of the validity of this.
shortest_run_time_values = np.array([time.value_in_unit(time_unit)
for time in shortest_run_times])
# then compute MFPTs
# mask the values for which no data was observed (nans)
# calculate the rates for all the runs, masking the zero values
# which need special tolerances due to the small values. These
# small values are dependent upon the minimum probabilities
# allowed by the simulation so this must be a parameter.
#masked_mean_weights = np.ma.masked_invalid(mean_weights)
# actually calculate the average mfpts from the mean weights over
# time
mfpts = shortest_run_time_values / mean_weights
# compute the error in the rates using the relationship between
# the MFPT and the weight: MFPT_error = weight_error * t * weight^2
mfpt_sems = sem_weights * shortest_run_time_values * (mean_weights**-2)
upper_sem_mfpts = mfpts + mfpt_sems
lower_sem_mfpts = mfpts - mfpt_sems
# clip the things that are close to zero or less than zero since
# this is just a floating point subtraction error
close_to_zero_idx = np.argwhere(np.logical_or(np.isclose(lower_sem_mfpts, 0.),
lower_sem_mfpts < 0.))
lower_sem_mfpts[close_to_zero_idx] = 0.
# then compute the bounds for the MFPTs
# upper_sem_mfpts = shortest_run_time_values / upper_sem_bounds
# lower_sem_mfpts = shortest_run_time_values / lower_sem_bounds
# because the lower standard error bound is lower than the average
# we want to make sure the call to isclose doesn't count them as 0
# so we make a new min_prob for the masking that takes into
# account the lowest value. So we take the minimum value that is
# not identically 0
# lower_min_prob = lower_sem_bounds[lower_sem_bounds > 0.].min() * 1e-2
# actually we are going to consder anythin 2 orders of magnitude
# lower than the minimum probability to be zero so:
# lower_sem_bound_min_value = min_probability * 1e-2
# zero_masked_upper_sem_weights = np.ma.masked_values(upper_sem_bounds, 0.,
# atol=min_probability)
# zero_masked_lower_sem_weights = np.ma.masked_values(lower_sem_bounds, 0.,
# atol=lower_min_prob)
# # we calculate the standard error of the mean weights as well
# zero_masked_upper_sem_mfpts = (shortest_run_time_values /
# zero_masked_upper_sem_weights)
# zero_masked_lower_sem_mfpts = (shortest_run_time_values /
# zero_masked_lower_sem_weights)
# make the plots, one for aggregate probability the other for
# RT
fig, axes = plt.subplots(1, 2)
prob_ax = axes[0]
rt_ax = axes[1]
for run_idx in run_target_weights_rates.keys():
# do the weights plot
# get the times and weights converting if necessary
run_weights = runs_weights[run_idx]
run_times = [time.value_in_unit(time_unit) for time in runs_times[run_idx]]
# slice them to the smallest run length
run_weights = run_weights[0:min_num_timepoints]
run_times = run_times[0:min_num_timepoints]
label = RUN_LABEL_TEMPLATE.format(run_idx)
# TODO set color better
prob_ax.plot(run_times, run_weights, label=label, linewidth='3')
# then do the residence time plots
run_rts = runs_rts[run_idx]
# convert the residence times to the time unit specified
run_rts = [rt * time_unit for rt in run_rts]
shortest_run_times_values = [time.value_in_unit(time_unit) for time in shortest_run_times]
prob_ax.plot(shortest_run_times_values,
mean_weights,
label='Mean', linewidth='3', color='black')
prob_ax.fill_between(shortest_run_times_values,
mean_weights-sem_weights,
mean_weights+sem_weights,
label='Std. Error')
# get the units from the value of the rates
# rate_unit = runs_rates[list(runs_rates.keys())[0]][0].get_symbol()
# # convert to the given unit as a rate
# if time_unit is not None:
# rate_unit = 1 / (1 / rate_unit).in_units_of(time_unit)
prob_ax.set_yscale('log')
prob_ax.set_ylabel('Aggregated unbound probability')
# TODO set this intelligently
prob_ax.set_ylim([1e-13, 1e-2])
prob_ax.set_xlabel('Simulation time (${}$)'.format(time_unit.get_symbol()))
prob_ax.legend(loc='best')
# plot the average rate
# get the times and convert to the proper units
times = [time.value_in_unit(time_unit) for time in shortest_run_times]
# first because they were dedimensionalized we redimensionalize
# them to the specified time_unit (x-axis) then convert them to
# the MFPT unit, then dedimensionalize them to raw values for
# plotting
mfpt_values = np.array([(mfpt * time_unit).value_in_unit(mfpt_unit)
for mfpt in mfpts])
upper_sem_mfpt_values = np.array([(mfpt * time_unit).value_in_unit(mfpt_unit)
for mfpt in upper_sem_mfpts])
lower_sem_mfpt_values = np.array([(mfpt * time_unit).value_in_unit(mfpt_unit)
for mfpt in lower_sem_mfpts])
# clip the things that are close to zero or less than zero since
# this is just a floating point error from multiplying...
close_to_zero_idx = np.argwhere(np.logical_or(np.isclose(lower_sem_mfpt_values, 0.),
lower_sem_mfpts < 0.))
lower_sem_mfpt_values[close_to_zero_idx] = 0.
rt_ax.plot(times, mfpt_values, label='Mean', linewidth='3', color='black')
rt_ax.fill_between(times,
upper_sem_mfpt_values, lower_sem_mfpt_values,
label='Std. Error of Mean')
if logscale:
rt_ax.set_yscale('log')
# plt.xticks([400, 800, 1200, 1600], ['400', '800', '1200', '1600'])
# plt.yticks([0.1, 10, 1000, 100000, 10000000, 1000000000])
rt_ax.set_xlabel('Simulation time ({})'.format(time_unit.get_symbol()))
rt_ax.set_ylabel('Predicted residence time ({})'.format(mfpt_unit.get_symbol()))
#rt_ax.set_ylim([1e-1, 1e9])
# plot the experimental value
experimental_points = [experimental_rt.value_in_unit(mfpt_unit) for _ in run_times]
rt_ax.plot(times, experimental_points, color='r', label='Experimental')
rt_ax.legend()
# return the actual computed values for the average aggregated
# weights and residence times
mfpts = np.array([(mfpt * time_unit).value_in_unit(mfpt_unit)
for mfpt in mfpts]) * mfpt_unit
mfpt_sems = np.array([(mfpt * time_unit).value_in_unit(mfpt_unit)
for mfpt in mfpt_sems]) * mfpt_unit
return mean_weights, sem_weights, mfpts, mfpt_sems, shortest_run_times, (fig, prob_ax, rt_ax)
#+END_SRC
**** Plot Rates and RT
Actually perform the plotting for different ligands and runs.
For this we want to plot each run independently and then aggregate
them.
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_tasks.py
def gexp_plot_rates_rt(gexp):
import matplotlib.pyplot as plt
from wepy.analysis.rates import contig_warp_rates
span_rates = {}
for span_id in get_gexp_span_ids(gexp):
contig = get_gexp_span_contig(gexp, span_id)
with contig:
# get the values as a series
contig_rates = contig_warp_rates(contig, CYCLE_TIME, time_points=Ellipsis)
span_rates[span_id] = contig_rates
# make the plot for each span
mean_weights, sem_weights, mfpts, mfpt_sems, total_sampling_times, axes = \
plot_rates(span_rates, CYCLE_TIME, EXPERIMENTAL_RT,
target_idx=0, min_probability=WE_PMIN,
mfpt_unit=tkunit.minute,
time_unit=tkunit.microsecond,
logscale=True)
print("Gexp: {}".format(gexp))
print("Final MFPT: {}".format(mfpts[-1]))
print("Final MFPT SEM: {}".format(mfpt_sems[-1]))
# TODO: the total sampling time will be gotten another way
##################################################################
# print("Total sampling time: {}".format( #
# total_sampling_times[-1].value_in_unit(tkunit.microsecond) #
# * tkunit.microsecond #
# * len(contigtree.span_traces)) #
# ) #
##################################################################
plt.show()
#+END_SRC
*** Free Energy Profiles
** Prefect Workflows
*** Header
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/_pipelines.py
"""Generated file from the analysis.org file. Do not edit directly."""
import inspect
from prefect import Flow
import prefect
import wepy.lysozyme_test._tasks as tasks_module
# these helper functions are for automatically listing all of the
# functions defined in the tasks module
def is_mod_function(mod, func):
return inspect.isfunction(func) and inspect.getmodule(func) == mod
def get_functions(mod):
# get only the functions that aren't module functions and that
# aren't private
return {func.__name__ : func for func in mod.__dict__.values()
if (is_mod_function(mod, func) and
not func.__name__.startswith('_')) }
# get the task functions and wrap them as prefect tasks
tasks = {name : prefect.task(func)
for name, func in get_functions(tasks_module).items()}
#+END_SRC
*** Example
#+begin_src python :tangle src/wepy_lysozyme_test/_pipelines.py
test_flow = Flow("Test flow")
# you can add tasks this way:
with test_flow:
result = tasks['test']()
#+end_src
** Execution
A less heavyweight alternative to running pipelines like below.
Each execution instance will become a submodule of the
'project_name.execution' module.
You can run them like this:
#+begin_src bash
python -m project_name.execution.my_execution_script
#+end_src
Execution scripts should be self contained in terms of domain
parameters.
An execution script may have command line parameters related to
execution tweaking. I.e. which dask cluster to use, how many cores,
etc.
**** Executors
Functions that allow for specifying different executions. These should
only be called under ~if __name__ == "__main__"~ blocks as they will
ask for command line input.
***** Local Machine
Trivial example of an executor that just runs the function.
#+begin_src python :tangle src/wepy_lysozyme_test/execution/__init__.py
def execute_locally(func):
func()
#+end_src
***** Local Dask Cluster
Either connect to an existing dask cluster or start one up locally.
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/execution/__init__.py
def dask_execute(func, processes=False, n_workers=4):
import sys
from dask.distributed import Client, LocalCluster
cluster_address = sys.argv[1]
DASHBOARD_PORT = 9998
if cluster_address == ':local':
cluster = LocalCluster(processes=processes,
n_workers=n_workers,
dashboard_address=":{}".format(DASHBOARD_PORT))
print("Ad hoc cluster online. Dashboard on port {}".format(DASHBOARD_PORT))
client = Client(cluster)
else:
client = Client(cluster_address)
func(client)
#+END_SRC
***** Prefect Pipeline
**** Scripts
***** Example: Raw
An example showing that you don't need any framework to help you run
something.
While tasks should be functional (and the only state saved is caching)
you can handle side effects like saving files etc. here.
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/execution/example_raw.py
def make_result(message):
from wepy.lysozyme_test._tasks import test
test()
return "Here is the result: " + message
if __name__ == "__main__":
result = make_result("Testing execution out")
#+END_SRC
***** Example: Using a Dask Cluster
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/execution/example_dask.py
# the function here where the first argument must be a client to the
# cluster
def func_closure(client):
from wepy.lysozyme_test._tasks import important_calculation
result = client.submit(important_calculation, "logging..").result()
# this defines which format to save it in, we are using the joblib
# pickle format
ext = 'jl.pkl'
result_file_path = osp.join(data_path(), f'my_results/result_A.{ext}')
# save the result to the file data store in the joblib pickle
# format
save_obj(result_file_path,
result,
overwrite=True,
ext='jl.pkl')
if __name__ == "__main__":
from wepy.lysozyme_test.execution import execute
# execute and receive options from command line
dask_execute(func_closure)
#+END_SRC
***** Example: Prefect Flow
#+BEGIN_SRC python :tangle src/wepy_lysozyme_test/execution/example_flow.py
if __name__ == "__main__":
from prefect.engine.executors import Executor
# get the flow
from wepy_lysozyme_test._pipelines import test_flow
# instantiate an executor from Prefect. We use the local one here
# for testing
executor = Executor()
# run the flow with the executor
state = test_flow.run(executor=executor)
#+END_SRC
***** Link Results
#+begin_src python :tangle src/wepy_lysozyme_test/execution/link_results.py
def link_results():
from wepy_lysozyme_test._tasks import link_gexp_hdf5s, EXPERIMENTAL_GROUPS
for gexp in EXPERIMENTAL_GROUPS:
path = link_gexp_hdf5s(gexp)
print(f"Experimental group {gexp} results at {path}")
if __name__ == "__main__":
link_results()
#+end_src
***** Resampling Tree Visualizations
#+begin_src python :tangle src/wepy_lysozyme_test/execution/gen_restrees.py
import click
@click.command()
@click.argument('gexp')
def gen_restrees(gexp):
from wepy_lysozyme_test._tasks import (
get_gexp_span_ids,
save_restree_gexf,
)
for span_id in get_gexp_span_ids(gexp):
save_restree_gexf(gexp, span_id)
if __name__ == "__main__":
gen_restrees()
#+end_src
***** Warping lineage trajectories
****** Separated
#+begin_src python :tangle src/wepy_lysozyme_test/execution/gen_warp_lineages_multi.py
import click
@click.command()
@click.argument('gexp')
def gen_warp_lineages_multi(gexp):
from wepy_lysozyme_test._tasks import (
get_gexp_span_ids,
save_warp_lineages_dcds,
)
print(f"Working on gexp: {gexp}")
for span_id in get_gexp_span_ids(gexp):
print(f"Working on span {span_id}")
save_warp_lineages_dcds(gexp, span_id)
if __name__ == "__main__":
gen_warp_lineages_multi()
#+end_src
****** Concatenated
#+begin_src python :tangle src/wepy_lysozyme_test/execution/gen_warp_lineages_single.py
import click
@click.command()
@click.argument('gexp')
def gen_warp_lineages_single(gexp):
from wepy_lysozyme_test._tasks import (
get_gexp_span_ids,
save_warp_lineages_dcd,
)
print(f"Working on gexp: {gexp}")
for span_id in get_gexp_span_ids(gexp):
print(f"Working on span {span_id}")
save_warp_lineages_dcd(gexp, span_id)
if __name__ == "__main__":
gen_warp_lineages_single()
#+end_src
***** Warp plots
#+begin_src python :tangle src/wepy_lysozyme_test/execution/show_rate_plots.py
import click
@click.command()
@click.argument('gexp')
def show_rate_plots(gexp):
from wepy_lysozyme_test._tasks import (
gexp_plot_rates_rt,
)
gexp_plot_rates_rt(gexp)
if __name__ == "__main__":
show_rate_plots()
#+end_src
***** Contig Stats tables
#+begin_src python :tangle src/wepy_lysozyme_test/execution/gen_contig_stats_table.py
import click
@click.command()
def gen_contig_stats_table():
from tabulate import tabulate
from wepy_lysozyme_test._tasks import (
save_span_stats_table,
span_stats_table_str,
)
save_span_stats_table(overwrite=True)
click.echo(span_stats_table_str())
if __name__ == "__main__":
gen_contig_stats_table()
#+end_src
* Invocations
The actual invocations you will make on the command line to run stuff.
Use TODO or checkboxes to manage them.
** NoResampler test jobs
*** DONE <2019-11-15 Fri 18:25>
#+begin_src bash
scancel 49420612
#+end_src
*** DONE <2019-11-19 Tue>
#+begin_src bash
(cd hpcc/wepy && sbatch submissions/noresampler_test.sh.slurm)
#+end_src
#+begin_example
(base) lotzsamu@dev-intel16-k80 :$19: [/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test]{Tue Nov 1912:12:21} 0:
--> (cd hpcc/wepy && sbatch submissions/noresampler_test.sh.slurm)
Submitted batch job 49870580
#+end_example
*** DONE <2019-11-19 Tue 13:20>
#+begin_src bash
(cd hpcc/wepy && sbatch submissions/noresampler_test.sh.slurm)
#+end_src
#+begin_example
(wepy.lysozyme_test.sims) lotzsamu@dev-intel16-k80 :$19: [/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test]{Tue Nov 1901:19:55} 0:
--> (cd hpcc/wepy && sbatch submissions/noresampler_test.sh.slurm)
Submitted batch job 49874750
#+end_example
** Resampler Comparison Simulation jobs
*** DONE Run tests with new setups
#+begin_src bash
(cd hpcc/wepy && (
sbatch submissions/revo_test.sh.slurm;
sbatch submissions/wexplore_test.sh.slurm;
))
#+end_src
*** DONE Run jobs
Run 3-4 replicates each:
#+begin_src bash
(cd hpcc/wepy && (
# the resampler ones
for i in {0,1,2,3}; do
sbatch submissions/noresampler_test.sh.slurm;
sbatch submissions/revo_test.sh.slurm;
sbatch submissions/wexplore_test.sh.slurm;
done
))
#+end_src
**** <2019-11-20 Wed>
#+begin_example
(base) lotzsamu@dev-intel16-k80 :$19: [/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test]{Wed Nov 2002:44:56} 0:
--> (cd hpcc/wepy && (
> # the resampler ones
> for i in {0,1,2,3}; do
> sbatch submissions/noresampler_test.sh.slurm;
> sbatch submissions/revo_test.sh.slurm;
> sbatch submissions/wexplore_test.sh.slurm;
> done
> ))
Submitted batch job 49952117
Submitted batch job 49952118
Submitted batch job 49952119
Submitted batch job 49952120
Submitted batch job 49952121
Submitted batch job 49952122
Submitted batch job 49952123
Submitted batch job 49952124
Submitted batch job 49952125
Submitted batch job 49952126
Submitted batch job 49952127
Submitted batch job 49952128
#+end_example
**** <2019-11-21 Thu 12:10>
I changed the file names out from under the job scheduler so some of
them failed.
Resubmit them:
#+begin_src bash
(cd hpcc/wepy &&
sbatch submissions/revo.sh.slurm;
sbatch submissions/wexplore.sh.slurm
)
#+end_src
#+begin_example
(common) lotzsamu@dev-intel16-k80 :$19: [/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test]{Thu Nov 2112:05:58} 0:
--> (cd hpcc/wepy &&
> sbatch submissions/revo.sh.slurm;
> sbatch submissions/wexplore.sh.slurm
> )
Submitted batch job 50036052
Submitted batch job 50036053
(common) lotzsamu@dev-intel16-k80 :$19: [/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test]{Thu Nov 2112:11:50} 0:
--> (cd hpcc/wepy && sbatch submissions/noresampler_ensemble.sh.slurm; )
Submitted batch job 50036078
#+end_example
** Single walker simulation jobs
Run 3-4 replicates each:
#+begin_src bash
(cd hpcc/wepy && (
# the resampler ones
for i in {0,1,2,3}; do
sbatch submissions/noresampler_single.sh.slurm;
done
))
#+end_src
*** <2019-11-21 Thu 12:06>
#+begin_example
(common) lotzsamu@dev-intel16-k80 :$19: [/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test]{Thu Nov 2112:05:39} 0:
--> (cd hpcc/wepy && (
> # the resampler ones
> for i in {0,1,2,3}; do
> sbatch submissions/noresampler_single.sh.slurm;
> done
> ))
Submitted batch job 50035864
Submitted batch job 50035866
Submitted batch job 50035867
Submitted batch job 50035868
#+end_example
** Simulations Backups and saving
*** <2019-12-03 Tue 13:21>
- [X] First get them to the local computer
#+begin_src bash
rsync -ravvhhiz \
lotzsamu@rsync.hpcc.msu.edu:/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test/hpcc/wepy/jobs/ \
$HOME/tree/lab/projects/wepy.lysozyme_test/hpcc/wepy/jobs
#+end_src
- [X] Pop the links out to resources
#+begin_src bash
symlink-pop /home/salotz/tree/lab/projects/wepy.lysozyme_test/hpcc/wepy/jobs \
/home/salotz/tree/lab/resources/project-resources/wepy.lysozyme_test/hpcc/wepy/jobs
#+end_src
- [ ] then back them up to volta
Just doing the whole tree since that needs to be done anyways.
#+begin_src bash
inv refugue.rsync --source superior --target volta
#+end_src
- [ ] backup to allegheny
#+begin_src bash
inv refugue.rsync --source superior --target allegheny
#+end_src
** Analysis
*** Resampling Trees
**** <2019-12-05 Thu 19:09>
- [ ]
#+begin_src bash
python -m wepy_lysozyme_test.execution.gen_restrees 'WExplore'
#+end_src
- [ ]
#+begin_src bash
python -m wepy_lysozyme_test.execution.gen_restrees 'REVO'
#+end_src
This isn't working because they are too long.
*** Warping Lineages
**** <2019-12-09 Mon 13:04>
To get them all in one trajectory file:
This is failing because some of them are too large I think.
#+begin_src bash
(for gexp in 'No' 'No_single' 'REVO' 'WExplore'; do
python -m wepy_lysozyme_test.execution.gen_warp_lineages $gexp || exit 1
done
)
#+end_src
Try this without all of them.
#+begin_src bash
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'No'
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'No_single'
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'REVO'
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'WExplore'
#+end_src
#+begin_src bash
# No
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'No' 0
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'No' 1
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'No' 2
# No_single
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'No_single' 0
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'No_single' 1
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'No_single' 2
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'No_single' 3
# REVO
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'REVO' 0
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'REVO' 1
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'REVO' 2
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'REVO' 3
# WExplore
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'WExplore' 0
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'WExplore' 1
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'WExplore' 2
python -m wepy_lysozyme_test.execution.gen_warp_lineages 'WExplore' 3
#+end_src
**** <2019-12-10 Tue 11:50>
After fixing recursion issue this should be better now.
#+begin_src bash
(for gexp in 'No' 'No_single' 'REVO'; do
python -m wepy_lysozyme_test.execution.gen_warp_lineages_multi $gexp || exit 1
done
)
#+end_src
This worked so well that it froze my machine. I'm not sure where it
failed, but it must have been on REVO because there are none written
for it.
Lets try this one at a time again.
#+begin_src bash
python -m wepy_lysozyme_test.execution.gen_warp_lineages_single 'WExplore'
python -m wepy_lysozyme_test.execution.gen_warp_lineages_single 'REVO'
#+end_src
*** Spans Table
**** <2019-12-10 Tue 11:52>
#+begin_src bash
python -m wepy_lysozyme_test.execution.gen_contig_stats_table
#+end_src
Here is the result in org format:
| | gexp | span_id | n_runs | n_warps | n_cycles | total_sampling_time_ps | warps_per_ps | rate_ps | total_warped_weight | warp_weight_mean | warp_weight_median | warp_weight_min | warp_weight_max | warp_wait-time_mean_ps | warp_wait-time_median_ps | warp_wait-time_min_ps | warp_wait-time_max_ps |
|----+-----------+-----------+----------+-----------+------------+--------------------------+----------------+-------------+-----------------------+--------------------+----------------------+-------------------+-------------------+--------------------------+----------------------------+-------------------------+-------------------------|
| 0 | No | 0 | 1 | 7 | 1101 | 1.05696e+06 | 6.62277e-06 | 1.37974e-07 | 0.145833 | 0.0208333 | 0.0208333 | 0.0208333 | 0.0208333 | 18317.1 | 17960 | 14580 | 21240 |
| 1 | No | 1 | 1 | 21 | 4645 | 4.4592e+06 | 4.70936e-06 | 9.81118e-08 | 0.4375 | 0.0208333 | 0.0208333 | 0.0208333 | 0.0208333 | 48672.4 | 41940 | 9760 | 92120 |
| 2 | No | 2 | 1 | 4 | 1107 | 1.06272e+06 | 3.76393e-06 | 7.84151e-08 | 0.0833333 | 0.0208333 | 0.0208333 | 0.0208333 | 0.0208333 | 14915 | 15830 | 6460 | 21540 |
| 3 | No | 3 | 1 | 7 | 1139 | 1.09344e+06 | 6.40181e-06 | 1.33371e-07 | 0.145833 | 0.0208333 | 0.0208333 | 0.0208333 | 0.0208333 | 18940 | 19580 | 11420 | 22580 |
| 4 | No_single | 0 | 1 | 2 | 42914 | 858280 | 2.33024e-06 | 2.33024e-06 | 2 | 1 | 1 | 1 | 1 | 759060 | 759060 | 748720 | 769400 |
| 5 | No_single | 1 | 1 | 2 | 41619 | 832380 | 2.40275e-06 | 2.40275e-06 | 2 | 1 | 1 | 1 | 1 | 730970 | 730970 | 659480 | 802460 |
| 6 | No_single | 2 | 1 | 4 | 42058 | 841160 | 4.75534e-06 | 4.75534e-06 | 4 | 1 | 1 | 1 | 1 | 120240 | 119320 | 44640 | 197680 |
| 7 | No_single | 3 | 1 | 3 | 42642 | 852840 | 3.51766e-06 | 3.51766e-06 | 3 | 1 | 1 | 1 | 1 | 510460 | 478380 | 449320 | 603680 |
| 8 | REVO | 0 | 1 | 1381 | 1142 | 1.09632e+06 | 0.00125967 | 3.6137e-08 | 0.0396177 | 2.86877e-05 | 5.29838e-11 | 1.00581e-12 | 0.0308164 | 13088 | 13760 | 1100 | 22780 |
| 9 | REVO | 1 | 1 | 1458 | 1126 | 1.08096e+06 | 0.0013488 | 3.22975e-09 | 0.00349123 | 2.39453e-06 | 1.01315e-10 | 1.01126e-12 | 0.00152529 | 14434 | 14900 | 320 | 22500 |
| 10 | REVO | 2 | 1 | 469 | 1103 | 1.05888e+06 | 0.000442921 | 3.23438e-13 | 3.42482e-07 | 7.3024e-10 | 5.8895e-12 | 1.0023e-12 | 2.54594e-08 | 12041.7 | 11660 | 340 | 21900 |
| 11 | REVO | 3 | 1 | 1029 | 1100 | 1.056e+06 | 0.000974432 | 5.1951e-09 | 0.00548602 | 5.33141e-06 | 1.69055e-11 | 1.00155e-12 | 0.00320306 | 10512.9 | 9380 | 160 | 21980 |
| 12 | WExplore | 0 | 1 | 70 | 1134 | 1.08864e+06 | 6.43004e-05 | 7.02838e-08 | 0.0765138 | 0.00109305 | 9.23911e-08 | 1.92453e-12 | 0.0387847 | 11624.3 | 11340 | 420 | 21720 |
| 13 | WExplore | 1 | 1 | 85 | 1129 | 1.08384e+06 | 7.84249e-05 | 1.58479e-07 | 0.171766 | 0.00202078 | 1.91101e-08 | 4.27336e-12 | 0.0670075 | 10480.9 | 9360 | 920 | 22320 |
| 14 | WExplore | 2 | 1 | 264 | 4585 | 4.4016e+06 | 5.99782e-05 | 8.50116e-09 | 0.0374187 | 0.000141738 | 1.07901e-07 | 1.04765e-12 | 0.0138983 | 47844.8 | 50220 | 1680 | 91560 |
| 15 | WExplore | 3 | 1 | 67 | 1095 | 1.0512e+06 | 6.37367e-05 | 3.54706e-07 | 0.372867 | 0.00556517 | 5.52999e-07 | 1.84583e-12 | 0.357752 | 11421.2 | 11540 | 320 | 21300 |
*** Rates
**** <2019-12-10 Tue 21:43>
#+begin_src bash
(for gexp in 'No' 'No_single' 'REVO' 'WExplore'; do
python -m wepy_lysozyme_test.execution.show_rate_plots $gexp || exit 1
done)
#+end_src
For the 'No' simulations
#+begin_example
Traceback (most recent call last):
File "/home/salotz/anaconda3/envs/wepy.lysozyme_test.common_dev/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/home/salotz/anaconda3/envs/wepy.lysozyme_test.common_dev/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/salotz/tree/lab/projects/wepy.lysozyme_test/src/wepy_lysozyme_test/execution/show_rate_plots.py", line 15, in
show_rate_plots()
File "/home/salotz/.local/lib/python3.6/site-packages/click/core.py", line 764, in __call__
return self.main(*args, **kwargs)
File "/home/salotz/.local/lib/python3.6/site-packages/click/core.py", line 717, in main
rv = self.invoke(ctx)
File "/home/salotz/.local/lib/python3.6/site-packages/click/core.py", line 956, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/home/salotz/.local/lib/python3.6/site-packages/click/core.py", line 555, in invoke
return callback(*args, **kwargs)
File "/home/salotz/tree/lab/projects/wepy.lysozyme_test/src/wepy_lysozyme_test/execution/show_rate_plots.py", line 11, in show_rate_plots
gexp_plot_rates_rt(gexp)
File "/home/salotz/tree/lab/projects/wepy.lysozyme_test/src/wepy_lysozyme_test/_tasks.py", line 1333, in gexp_plot_rates_rt
contig = get_gexp_span_contig(gexp, span_id)
File "/home/salotz/tree/lab/projects/wepy.lysozyme_test/src/wepy_lysozyme_test/_tasks.py", line 672, in get_gexp_span_contig
span_ids_to_span_idxs_d = get_gexp_span_ids_span_idxs_map(gexp)
File "/home/salotz/anaconda3/envs/wepy.lysozyme_test.common_dev/lib/python3.7/site-packages/joblib/memory.py", line 568, in __call__
return self._cached_call(args, kwargs)[0]
File "/home/salotz/anaconda3/envs/wepy.lysozyme_test.common_dev/lib/python3.7/site-packages/joblib/memory.py", line 534, in _cached_call
out, metadata = self.call(*args, **kwargs)
File "/home/salotz/anaconda3/envs/wepy.lysozyme_test.common_dev/lib/python3.7/site-packages/joblib/memory.py", line 734, in call
output = self.func(*args, **kwargs)
File "/home/salotz/tree/lab/projects/wepy.lysozyme_test/src/wepy_lysozyme_test/_tasks.py", line 653, in get_gexp_span_ids_span_idxs_map
for span_id, run_idxs in get_gexp_span_ids_run_idxs(gexp).items()}
File "/home/salotz/tree/lab/projects/wepy.lysozyme_test/src/wepy_lysozyme_test/_tasks.py", line 583, in get_gexp_span_ids_run_idxs
jobs_runs_d = get_gexp_jobs_to_runs_map(gexp)
File "/home/salotz/tree/lab/projects/wepy.lysozyme_test/src/wepy_lysozyme_test/_tasks.py", line 554, in get_gexp_jobs_to_runs_map
jobid = run_grp.attrs['jobid']
File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
File "/home/salotz/anaconda3/envs/wepy.lysozyme_test.common_dev/lib/python3.7/site-packages/h5py/_hl/attrs.py", line 60, in __getitem__
attr = h5a.open(self._id, self._e(name))
File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
File "h5py/h5a.pyx", line 77, in h5py.h5a.open
KeyError: "Can't open attribute (can't locate attribute: 'jobid')"
#+end_example
**** <2019-12-11 Wed 15:26>
#+begin_src bash
(for gexp in 'No' 'No_single' 'REVO' 'WExplore'; do
python -m wepy_lysozyme_test.execution.show_rate_plots $gexp || exit 1
done)
#+end_src
*** TODO Compute Observables
** Troubleshooting
*** TODO jobids missing
*** DONE Bad heap HDF5 thing
#+begin_example
In [5]: get_gexp_span_contig('No', 3)
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
in
----> 1 get_gexp_span_contig('No', 3)
~/tree/lab/projects/wepy.lysozyme_test/src/wepy_lysozyme_test/_tasks.py in get_gexp_span_contig(gexp, span_id)
557 # get the contig for this span
558 contigtree = get_contigtree(gexp)
--> 559 contig = contigtree.span_contig(span_idx)
560
561 return contig
~/tree/lab/devel/wepy/src/wepy/analysis/contig_tree.py in span_contig(self, span_idx)
215 """Generates a contig object for the specified spanning contig."""
216
--> 217 contig = self.make_contig(self.span_traces[span_idx])
218
219 return contig
~/tree/lab/devel/wepy/src/wepy/analysis/contig_tree.py in make_contig(self, contig_trace)
1329 continuations=continuations,
1330 boundary_condition_class=self.boundary_condition_class,
-> 1331 decision_class=self.decision_class)
1332
1333
~/tree/lab/devel/wepy/src/wepy/analysis/contig_tree.py in __init__(self, wepy_h5, **kwargs)
1413
1414 # use the superclass initialization
-> 1415 super().__init__(wepy_h5, **kwargs)
1416
1417 # check that the result is a single contig
~/tree/lab/devel/wepy/src/wepy/analysis/contig_tree.py in __init__(self, wepy_h5, base_contigtree, continuations, runs, boundary_condition_class, decision_class)
1207 runs=runs,
1208 boundary_condition_class=boundary_condition_class,
-> 1209 decision_class=decision_class)
1210
1211 self._set_base_contigtree_to_self(new_contigtree)
~/tree/lab/devel/wepy/src/wepy/analysis/contig_tree.py in __init__(self, wepy_h5, continuations, runs, boundary_condition_class, decision_class)
175 self._create_tree(wepy_h5)
176
--> 177 self._set_resampling_panels(wepy_h5)
178
179 if self._decision_class is not None:
~/tree/lab/devel/wepy/src/wepy/analysis/contig_tree.py in _set_resampling_panels(self, wepy_h5)
264 for run_idx in self.run_idxs:
265
--> 266 run_resampling_panel = wepy_h5.run_resampling_panel(run_idx)
267
268 # add each cycle of this panel to the network by adding
~/tree/lab/devel/wepy/src/wepy/hdf5.py in run_resampling_panel(self, run_idx)
4883
4884 """
-> 4885 return self.run_contig_resampling_panel([run_idx])
4886
4887 def run_contig_resampling_panel(self, run_idxs):
~/tree/lab/devel/wepy/src/wepy/hdf5.py in run_contig_resampling_panel(self, run_idxs)
4908
4909 # make the resampling panel from the resampling records for the contig
-> 4910 contig_resampling_panel = resampling_panel(self.resampling_records(run_idxs),
4911 is_sorted=False)
4912
~/tree/lab/devel/wepy/src/wepy/hdf5.py in resampling_records(self, run_idxs)
4680 """
4681
-> 4682 return self.run_contig_records(run_idxs, RESAMPLING)
4683
4684 def resampling_records_dataframe(self, run_idxs):
~/tree/lab/devel/wepy/src/wepy/hdf5.py in run_contig_records(self, run_idxs, run_record_key)
4613 # sporadic then we just use the cycle idxs
4614 if self._is_sporadic_records(run_record_key):
-> 4615 records = self._run_records_sporadic(run_idxs, run_record_key)
4616 else:
4617 records = self._run_records_continual(run_idxs, run_record_key)
~/tree/lab/devel/wepy/src/wepy/hdf5.py in _run_records_sporadic(self, run_idxs, run_record_key)
2055 # get all the value columns from the datasets, and convert
2056 # them to something amenable to a table
-> 2057 run_fields = self._convert_record_fields_to_table_columns(run_idx, run_record_key)
2058
2059 # we need to concatenate each field to the end of the
~/tree/lab/devel/wepy/src/wepy/hdf5.py in _convert_record_fields_to_table_columns(self, run_idx, run_record_key)
1978 for record_field in self.record_fields[run_record_key]:
1979 fields[record_field] = self._convert_record_field_to_table_column(
-> 1980 run_idx, run_record_key, record_field)
1981
1982 return fields
~/tree/lab/devel/wepy/src/wepy/hdf5.py in _convert_record_field_to_table_column(self, run_idx, run_record_key, record_field)
1931 # cast all elements to tuples
1932 if h5py.check_dtype(vlen=dset.dtype) is not None:
-> 1933 rec_dset = [tuple(value) for value in dset[:]]
1934
1935 # if it is not variable length make sure it is not more than a
h5py/_objects.pyx in h5py._objects.with_phil.wrapper()
h5py/_objects.pyx in h5py._objects.with_phil.wrapper()
~/anaconda3/envs/wepy.lysozyme_test.common_dev/lib/python3.7/site-packages/h5py/_hl/dataset.py in __getitem__(self, args)
571 mspace = h5s.create_simple(mshape)
572 fspace = selection.id
--> 573 self.id.read(mspace, fspace, arr, mtype, dxpl=self._dxpl)
574
575 # Patch up the output for NumPy
h5py/_objects.pyx in h5py._objects.with_phil.wrapper()
h5py/_objects.pyx in h5py._objects.with_phil.wrapper()
h5py/h5d.pyx in h5py.h5d.DatasetID.read()
h5py/_proxy.pyx in h5py._proxy.dset_rw()
h5py/_proxy.pyx in h5py._proxy.H5PY_H5Dread()
OSError: Can't read data (bad global heap collection signature)
#+end_example
50036078 is the job ID it is failing on. Lets try recopying it. Maybe
it had an error.
Found out that specifically it is the target idxs that are kind of screwy.
#+begin_example
(pdb++)>>> no = self._convert_record_field_to_table_column(run_idx, run_record_key, 'target_idxs')
,*** OSError: Can't read data (bad global heap collection signature)
Traceback (most recent call last):
File "/home/salotz/tree/lab/devel/wepy/src/wepy/hdf5.py", line 1933, in _convert_record_field_to_table_column
rec_dset = [tuple(value) for value in dset[:]]
File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
File "/home/salotz/anaconda3/envs/wepy.lysozyme_test.common_dev/lib/python3.7/site-packages/h5py/_hl/dataset.py", line 573, in __getitem__
self.id.read(mspace, fspace, arr, mtype, dxpl=self._dxpl)
File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
File "h5py/h5d.pyx", line 182, in h5py.h5d.DatasetID.read
File "h5py/_proxy.pyx", line 158, in h5py._proxy.dset_rw
File "h5py/_proxy.pyx", line 84, in h5py._proxy.H5PY_H5Dread
#+end_example
**** Trying to read them straight from the allegheny disk locally
#+begin_src python
from wepy.hdf5 import WepyHDF5
path = '/media/salotz/allegheny/salotz/tree/lab/projects/wepy.lysozyme_test/data/sims/wepy/jobs/50036078/root.wepy.h5'
wepy_h5 = WepyHDF5(path, mode='r')
with wepy_h5:
dset = wepy_h5.records_grp(0, 'resampling')['target_idxs']
dset[:]
#+end_src
**** HPCC's copy
#+begin_src python
from wepy.hdf5 import WepyHDF5
path = '/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test/hpcc/wepy/jobs/50036078/root.wepy.h5'
wepy_h5 = WepyHDF5(path, mode='r')
with wepy_h5:
dset = wepy_h5.records_grp(0, 'resampling')['target_idxs']
dset[:]
#+end_src
Okay this one is fine. Something happened while it was being copied...
**** solution
Lets get it back. This might take a while...
#+begin_src bash
rsync lotzsamu@rsync.hpcc.msu.edu:/mnt/gs18/scratch/users/lotzsamu/tree/lab/projects/wepy.lysozyme_test/hpcc/wepy/jobs/50036078/root.wepy.h5 \
/home/salotz/tree/lab/projects/wepy.lysozyme_test/data/sims/wepy/jobs/50036078/root.wepy.h5
#+end_src
Its actually only 9 GB so it should be fine.
Then copy it manually to the drive to make sure it overrides it.
#+begin_src bash
rsync /home/salotz/tree/lab/projects/wepy.lysozyme_test/data/sims/wepy/jobs/50036078/root.wepy.h5 \
/media/salotz/allegheny/salotz/tree/lab/projects/wepy.lysozyme_test/data/sims/wepy/jobs/50036078/root.wepy.h5
#+end_src
I had renamed them so get rid of the bad ones once we know it works:
#+begin_src bash
rm -f /home/salotz/tree/lab/projects/wepy.lysozyme_test/data/sims/wepy/jobs/50036078/root_bad.wepy_h5
rm -f /media/salotz/allegheny/salotz/tree/lab/projects/wepy.lysozyme_test/data/sims/wepy/jobs/50036078/root_bad.wepy.h5
#+end_src
*** DONE Getting all lineages of terminal walkers
#+begin_src python :tangle troubleshoot/terminal_lineages.py
from wepy.hdf5 import WepyHDF5
from wepy.analysis.contig_tree import BaseContigTree, ContigTree
from wepy.resampling.decisions.clone_merge import MultiCloneMergeDecision
from wepy.resampling.decisions.decision import NoDecision
from wepy.boundary_conditions.receptor import UnbindingBC
from wepy_lysozyme_test._tasks import *
test_h5_path = project_path() / "tmp/test_wepy_sim/root.wepy.h5"
wepy_h5 = WepyHDF5(test_h5_path, mode='r')
base_contigtree = BaseContigTree(wepy_h5,
decision_class=MultiCloneMergeDecision,
boundary_condition_class=UnbindingBC)
contigtree = ContigTree(wepy_h5, base_contigtree=base_contigtree)
res_trace = contigtree.resampling_trace(MultiCloneMergeDecision.ENUM.SQUASH)
warp_trace = contigtree.warp_trace()
final_trace = contigtree.final_trace()
# deduplicate and make the final collection of terminals in a trace
terminal_trace = list(set(res_trace) | set(warp_trace) | set(final_trace))
# then we can get all the lineages for them
lines = contigtree.lineages(terminal_trace)
# TODO: do the full parent tree traversals
# make the parent tree
parent_forest = ParentForest(contig=contig)
# preorder traversal
preorder_traversal_trace = list(nx.dfs_preorder_nodes(parent_forest._graph))
# convert to a walker trace
preorder_traversal_trace = contig.walker_trace_to_run_trace(
[(traj_idx, cycle_idx) for cycle_idx, traj_idx in preorder_traversal_trace
if cycle_idx > -1])
span_data['preorder_traversal_trace'] = preorder_traversal_trace
# post order traversal
postorder_traversal_trace = list(nx.dfs_postorder_nodes(parent_forest._graph))
# convert to a walker trace
postorder_traversal_trace = contig.walker_trace_to_run_trace(
[(traj_idx, cycle_idx) for cycle_idx, traj_idx in postorder_traversal_trace
if cycle_idx > -1])
span_data['postorder_traversal_trace'] = postorder_traversal_trace
## cohort trajs
#+end_src
*** DONE Running Test Simulations
Run a simulation:
#+begin_src bash
./scripts/noresampler_local_test.sh
#+end_src
Now you have some data at: tmp/test_wepy_sims/root.wepy.h5
* Management
Area for managed data like lists and spreadsheets.
** Research
*** Info on the lysozyme system
See citations on bibsonomy.org:
https://www.bibsonomy.org/search/tags:lysozyme%20group:comp-pharm-bio
*** Papers:
**** cite:Baase2010T4LysozymeReview
Review on what the Lysozyme system has taught use. Details on the L99A
mutation cavity formation.
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2867005/
**** cite:NunesAlves2018T4LysozymeUnbinding
Paper that does WE simulations with a bunch of hand-crafted progress
coordinates in WESTPA.
https://www.sciencedirect.com/science/article/pii/S0006349518301401
#+BEGIN_QUOTE
The experimental rate constant for benzene dissociation () from T4L
L99A mutant has been determined (58) by lineshape analysis of protein
NMR as 950 s–1 at 303 K.
#+END_QUOTE
#+BEGIN_SRC python
import simtk.unit as unit
rate = 950 * (1/unit.second)
rt = 1/ rate
print(rt)
#+END_SRC
- residence time :: 0.0010526315789473684 s
- PDB structure :: 1L83
https://www.rcsb.org/structure/1L83
MD parameters:
- CHARMM36
- implicit solvation :: Born surface area form
- born radii :: OBC
- dialectric constant :: 80
- nonpolar contributions :: M. Schaefer, C. Bartels, M. Karplus Solution conformations and thermodynamics of structured peptides: molecular dynamics simulation with an implicit solvation model J. Mol. Biol., 284 (1998), pp. 835-848
- surface tension :: 5.4 cal mol^{-1} \AA^{-2}
- temperature :: 300 K
- integration step size :: 2 fs
- integrator :: leapfrog stochastic
- collision frequency :: 10 ps^{-1}
- Hydrogen-Heavy atoms constrained :: LINCS (faster SHAKE algorithm)
WE parameters:
- cycles :: 4000
- cycle time :: 10 ps at 400 K, 2 ps at 300 K
- traj-per-bin :: 4-5
**** cite:Wang2016LysozymeMetadynamicsUnbinding
Metadynamics study of ligand unbinding from lysozyme.
**** cite:Feher1996LysozymeBinding
Experimental rate constant paper.
**** Bibtex entries
#+BEGIN_SRC bibtex :tangle README.bib
@article{Baase2010T4LysozymeReview,
abstract = {An overview is presented of some of the major insights that have come from studies of the structure, stability, and folding of T4 phage lysozyme. A major purpose of this review is to provide the reader with a complete tabulation of all of the variants that have been characterized, including melting temperatures, crystallographic data, Protein Data Bank access codes, and references to the original literature. The greatest increase in melting temperature (T(m)) for any point mutant is 5.1 degrees C for the mutant Ser 117 --> Val. This is achieved in part not only by hydrophobic stabilization but also by eliminating an unusually short hydrogen bond of 2.48 A that apparently has an unfavorable van der Waals contact. Increases in T(m) of more than 3-4 degrees C for point mutants are rare, whereas several different types of destabilizing substitutions decrease T(m) by 20 degrees C or thereabouts. The energetic cost of cavity creation and its relation to the hydrophobic effect, derived from early studies of "large-to-small" mutants in the core of T4 lysozyme, has recently been strongly supported by related studies of the intrinsic membrane protein bacteriorhodopsin. The L99A cavity in the C-terminal domain of the protein, which readily binds benzene and many other ligands, has been the subject of extensive study. Crystallographic evidence, together with recent NMR analysis, suggest that these ligands are admitted by a conformational change involving Helix F and its neighbors. A total of 43 nonisomorphous crystal forms of different monomeric lysozyme mutants were obtained plus three more for synthetically-engineered dimers. Among the 43 space groups, P2(1)2(1)2(1) and P2(1) were observed most frequently, consistent with the prediction of Wukovitz and Yeates.},
author = {Baase, W A and Liu, L and Tronrud, D E and Matthews, B W},
journal = {Protein Sci},
misc = { pmid = {20095051},
doi = {10.1002/pro.344}},
month = {apr},
number = {4},
pages = {631-641},
title = {Lessons from the lysozyme of phage T4},
url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2867005/},
volume = {19},
year = {2010}
}
@article{NunesAlves2018T4LysozymeUnbinding,
abstract = {The T4 lysozyme L99A mutant is often used as a model system to study small-molecule binding to proteins, but pathways for ligand entry and exit from the buried binding site and the associated protein conformational changes have not been fully resolved. Here, molecular dynamics simulations were employed to model benzene exit from its binding cavity using the weighted ensemble (WE) approach to enhance sampling of low-probability unbinding trajectories. Independent WE simulations revealed four pathways for benzene exit, which correspond to transient tunnels spontaneously formed in previous simulations of apo T4 lysozyme. Thus, benzene unbinding occurs through multiple pathways partially created by intrinsic protein structural fluctuations. Motions of several α-helices and side chains were involved in ligand escape from metastable microstates. WE simulations also provided preliminary estimates of rate constants for each exit pathway. These results complement previous works and provide a semiquantitative characterization of pathway heterogeneity for binding of small molecules to proteins.},
author = {Nunes-Alves, Ariane and Zuckerman, Daniel M. and Arantes, Guilherme Menegon},
journal = {Biophysical Journal},
misc = { issn = {0006-3495},
doi = {https://doi.org/10.1016/j.bpj.2018.01.014}},
number = {5},
pages = {1058 - 1066},
title = {Escape of a Small Molecule from Inside T4 Lysozyme by Multiple Pathways},
url = {http://www.sciencedirect.com/science/article/pii/S0006349518301401},
volume = {114},
year = {2018}
}
@article{Wang2016LysozymeMetadynamicsUnbinding,
author = {Wang, Yong and Papaleo, Elena and Lindorff-Larsen, Kresten},
journal = {Elife},
pages = {e17505},
publisher = {eLife Sciences Publications Limited},
title = {Mapping transiently formed and sparsely populated conformations on a complex energy landscape},
volume = {5},
year = {2016}
}
@article{Feher1996LysozymeBinding,
author = {Feher, Victoria A and Baldwin, Enoch P and Dahlquist, Frederick W},
journal = {Nature structural biology},
number = {6},
pages = {516},
publisher = {Nature Publishing Group},
title = {Access of ligands to cavities within the core of a protein is rapid},
volume = {3},
year = {1996}
}
#+END_SRC
** Simulations
:PROPERTIES:
:TABLE_EXPORT_FILE: data/sim_management/jobs.csv
:TABLE_EXPORT_FORMAT: orgtbl-to-csv
:END:
| gexp | span_id | jobid | segment_idx | success | status |
|-----------+---------+----------+-------------+---------+-----------|
| No | 0 | 49952117 | 0 | yes | COMPLETED |
| No | 1 | 49952120 | 0 | yes | COMPLETED |
| No | 2 | 49952123 | 0 | yes | COMPLETED |
| No | 3 | 50036078 | 0 | yes | COMPLETED |
| No_single | 0 | 50035864 | 0 | yes | COMPLETED |
| No_single | 1 | 50035866 | 0 | yes | COMPLETED |
| No_single | 2 | 50035867 | 0 | yes | COMPLETED |
| No_single | 3 | 50035868 | 0 | yes | COMPLETED |
| REVO | 0 | 49952118 | 0 | yes | COMPLETED |
| REVO | 1 | 49952121 | 0 | yes | COMPLETED |
| REVO | 2 | 49952124 | 0 | yes | COMPLETED |
| REVO | 3 | 50036052 | 0 | yes | COMPLETED |
| WExplore | 0 | 49952122 | 0 | yes | COMPLETED |
| WExplore | 1 | 49952125 | 0 | yes | COMPLETED |
| WExplore | 2 | 49952119 | 0 | yes | COMPLETED |
| WExplore | 3 | 50036053 | 0 | yes | COMPLETED |
|-----------+---------+----------+-------------+---------+-----------|
** Analysis
*** Span Stats
| | gexp | span_id | n_runs | n_warps | n_cycles | total_sampling_time_ps | warps_per_ps | rate_ps | total_warped_weight | warp_weight_mean | warp_weight_median | warp_weight_min | warp_weight_max | warp_wait-time_mean_ps | warp_wait-time_median_ps | warp_wait-time_min_ps | warp_wait-time_max_ps |
|----+-----------+-----------+----------+-----------+------------+--------------------------+----------------+-------------+-----------------------+--------------------+----------------------+-------------------+-------------------+--------------------------+----------------------------+-------------------------+-------------------------|
| 0 | No | 0 | 1 | 7 | 1101 | 1.05696e+06 | 6.62277e-06 | 1.37974e-07 | 0.145833 | 0.0208333 | 0.0208333 | 0.0208333 | 0.0208333 | 18317.1 | 17960 | 14580 | 21240 |
| 1 | No | 1 | 1 | 21 | 4645 | 4.4592e+06 | 4.70936e-06 | 9.81118e-08 | 0.4375 | 0.0208333 | 0.0208333 | 0.0208333 | 0.0208333 | 48672.4 | 41940 | 9760 | 92120 |
| 2 | No | 2 | 1 | 4 | 1107 | 1.06272e+06 | 3.76393e-06 | 7.84151e-08 | 0.0833333 | 0.0208333 | 0.0208333 | 0.0208333 | 0.0208333 | 14915 | 15830 | 6460 | 21540 |
| 3 | No | 3 | 1 | 7 | 1139 | 1.09344e+06 | 6.40181e-06 | 1.33371e-07 | 0.145833 | 0.0208333 | 0.0208333 | 0.0208333 | 0.0208333 | 18940 | 19580 | 11420 | 22580 |
| 4 | No_single | 0 | 1 | 2 | 42914 | 858280 | 2.33024e-06 | 2.33024e-06 | 2 | 1 | 1 | 1 | 1 | 759060 | 759060 | 748720 | 769400 |
| 5 | No_single | 1 | 1 | 2 | 41619 | 832380 | 2.40275e-06 | 2.40275e-06 | 2 | 1 | 1 | 1 | 1 | 730970 | 730970 | 659480 | 802460 |
| 6 | No_single | 2 | 1 | 4 | 42058 | 841160 | 4.75534e-06 | 4.75534e-06 | 4 | 1 | 1 | 1 | 1 | 120240 | 119320 | 44640 | 197680 |
| 7 | No_single | 3 | 1 | 3 | 42642 | 852840 | 3.51766e-06 | 3.51766e-06 | 3 | 1 | 1 | 1 | 1 | 510460 | 478380 | 449320 | 603680 |
| 8 | REVO | 0 | 1 | 1381 | 1142 | 1.09632e+06 | 0.00125967 | 3.6137e-08 | 0.0396177 | 2.86877e-05 | 5.29838e-11 | 1.00581e-12 | 0.0308164 | 13088 | 13760 | 1100 | 22780 |
| 9 | REVO | 1 | 1 | 1458 | 1126 | 1.08096e+06 | 0.0013488 | 3.22975e-09 | 0.00349123 | 2.39453e-06 | 1.01315e-10 | 1.01126e-12 | 0.00152529 | 14434 | 14900 | 320 | 22500 |
| 10 | REVO | 2 | 1 | 469 | 1103 | 1.05888e+06 | 0.000442921 | 3.23438e-13 | 3.42482e-07 | 7.3024e-10 | 5.8895e-12 | 1.0023e-12 | 2.54594e-08 | 12041.7 | 11660 | 340 | 21900 |
| 11 | REVO | 3 | 1 | 1029 | 1100 | 1.056e+06 | 0.000974432 | 5.1951e-09 | 0.00548602 | 5.33141e-06 | 1.69055e-11 | 1.00155e-12 | 0.00320306 | 10512.9 | 9380 | 160 | 21980 |
| 12 | WExplore | 0 | 1 | 70 | 1134 | 1.08864e+06 | 6.43004e-05 | 7.02838e-08 | 0.0765138 | 0.00109305 | 9.23911e-08 | 1.92453e-12 | 0.0387847 | 11624.3 | 11340 | 420 | 21720 |
| 13 | WExplore | 1 | 1 | 85 | 1129 | 1.08384e+06 | 7.84249e-05 | 1.58479e-07 | 0.171766 | 0.00202078 | 1.91101e-08 | 4.27336e-12 | 0.0670075 | 10480.9 | 9360 | 920 | 22320 |
| 14 | WExplore | 2 | 1 | 264 | 4585 | 4.4016e+06 | 5.99782e-05 | 8.50116e-09 | 0.0374187 | 0.000141738 | 1.07901e-07 | 1.04765e-12 | 0.0138983 | 47844.8 | 50220 | 1680 | 91560 |
| 15 | WExplore | 3 | 1 | 67 | 1095 | 1.0512e+06 | 6.37367e-05 | 3.54706e-07 | 0.372867 | 0.00556517 | 5.52999e-07 | 1.84583e-12 | 0.357752 | 11421.2 | 11540 | 320 | 21300 |
* Log
Log of activities
** <2019-11-13 Wed>
Notes for today...
** <2019-12-04 Wed>
Finally finished simulations and I know that there were some good
results from the dashboards.
Now considering what my next goals are.
I do want to do some exploratory stuff to make sure the integrity of
the simulations is good.
Then I can work on computing some observables and doing fe profiles.
But first I will have to set up all the data retrieval and contig
trees etc.
- [ ] data retrieval functions
- [ ] "experiment" keys
- resampler conditions
- [ ] contig trees
- [ ] resampling tree visualization
- [ ] terminal trajectories
- [ ] exit points
- [ ] end walkers
- [ ] warping
- [ ] stats table
- [ ] rate plots
- [ ] compute observables
- [ ] ligand RMSD
- [ ] protein RMSD
- [ ] SASA
- [ ] FE profile plots
* COMMENT Scrapyard
Things you don't want to throw away but you don't want to keep in the
clean sections above.
** Scratch
*** Example of running lysozyme
#+begin_src python
""" This is a simple script to run traditional molecular dynamics simulations """
import os
from simtk import openmm, unit
from simtk.openmm.app.pdbfile import PDBFile
from simtk.openmm.app import statedatareporter
from openmmtools import testsystems, integrators
from sys import argv
import timeit
try:
file_name = argv[1]
total_steps = int(argv[2])
frame_period = int(argv[3])
except IndexError:
print('Please enter three arguments: [file name] [number of steps] '
'[fraction of steps to print to dcd file]')
raise
try:
platform_input = argv[4]
except IndexError:
platform_input = 'CPU'
print('Inputs:'
'\nFile prefix:', file_name,
'\nTotal steps:', total_steps,
'\nFrame period:', frame_period,
'\nCPU or CUDA?:', platform_input)
dcd_file_name = file_name + '.dcd'
pdb_file_name = file_name + '.pdb'
reporter_file_name = file_name + '.out'
state_file_name = file_name + '_' + platform_input + '.bin'
lysozyme_pxylene = testsystems.LysozymeImplicit()
topology = lysozyme_pxylene.topology
t4_system = lysozyme_pxylene.system
# Use timestep of 2 fs
new_timestep = unit.quantity.Quantity(2, unit.femtosecond)
integrator = integrators.LangevinIntegrator(timestep=new_timestep)
platform = openmm.Platform.getPlatformByName(platform_input)
if platform_input == 'CUDA': # Settings for running on GPU's
prop = dict(CudaPrecision='single')
simulation = openmm.app.Simulation(topology, t4_system, integrator, platform, prop)
else: # Presumably running on CPU's
simulation = openmm.app.Simulation(topology, t4_system, integrator, platform)
if os.path.exists(dcd_file_name):
os.remove(dcd_file_name)
positions = lysozyme_pxylene.positions
# Only minimize the energy and save the updated atomic positions if it hasn't been done before
if not os.path.exists(pdb_file_name):
with open(pdb_file_name, 'w') as file:
# MDTraj is preferred for this...
PDBFile.writeFile(topology, positions, file=file)
if os.path.exists(state_file_name): # Load the saved checkpoint of the simulation
simulation.loadCheckpoint(state_file_name)
print('Reusing previous checkpoint')
else: # Otherwise re-setup the simulation
print('Creating new checkpoint')
simulation.context.setPositions(positions)
simulation.context.setVelocitiesToTemperature(300*unit.kelvin)
print('Minimizing...')
# First minimize the energy
simulation.minimizeEnergy()
with open(state_file_name, 'w') as file:
simulation.saveCheckpoint(state_file_name)
simulation.reporters.append(openmm.app.DCDReporter(dcd_file_name, frame_period))
simulation.reporters.append(
statedatareporter.StateDataReporter(reporter_file_name, frame_period, step=True, time=True,
potentialEnergy=True, temperature=True, progress=True, remainingTime=True,
speed=True, totalSteps=total_steps, separator='\t'
)
)
print('Running Production...')
start_time = timeit.default_timer()
simulation.step(total_steps)
finish_time = timeit.default_timer()
print('Done.'
'\nTotal steps:', total_steps,
'\nFrame period:', frame_period,
'\nSimulation runtime (seconds):', finish_time - start_time)
#+end_src
** Analysis
** Invocation
* COMMENT Local Variables
# Local Variables:
# mode: org
# org-todo-keyword-faces: (("TODO" . org-warning) ("INPROGRESS" . "magenta") ("WAIT" . "orange") ("DONE" . org-done) ("CANCELLED" . org-done))
# org-table-export-default-format: orgtbl-to-csv
# End: