Python for Data Analytics

Python is a high-level interpreted language widely used in research and science. Using the ZIH systems allows you to work with Python more quickly and effectively. This page gives a general introduction to working with Python on ZIH systems. Further documentation is available for specific machine learning frameworks.

Python Console and Virtual Environments

Often, it is useful to create an isolated development environment, which can be shared among a research group and/or teaching class. For this purpose, Python virtual environments can be used.

The interactive Python interpreter can also be used on ZIH systems via an interactive job:

marie@login$ srun --partition=haswell --gres=gpu:1 --ntasks=1 --cpus-per-task=7 --pty --mem-per-cpu=8000 bash
marie@haswell$ module load Python
marie@haswell$ python
Python 3.8.6 (default, Feb 17 2021, 11:48:51)
[GCC 10.2.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>>

Jupyter Notebooks

Jupyter notebooks allow you to analyze data interactively in your web browser. One advantage of Jupyter is that code, documentation and visualization can be combined in a single notebook, so that they form a unit. Jupyter notebooks can be used for many tasks, such as data cleaning and transformation, numerical simulation, statistical modeling, data visualization and machine learning.

On ZIH systems, a JupyterHub is available, which can be used to run a Jupyter notebook on a node, using a GPU when needed.

Parallel Computing with Python

Pandas with Pandarallel

Pandas is a widely used library for data analytics in Python. In many cases, existing source code using Pandas can easily be adapted for parallel execution with the pandarallel module. The number of threads that can be used in parallel depends on the number of cores (parameter --cpus-per-task) within the Slurm request, e.g.

marie@login$ srun --partition=haswell --cpus-per-task=4 --mem=2G --hint=nomultithread --pty --time=8:00:00 bash

The above request allows the use of 4 parallel threads.

The following example shows how to parallelize the apply method for pandas dataframes with the pandarallel module. If the pandarallel module is not installed yet, use a virtual environment to install it.

??? example

```python
import pandas as pd
import numpy as np
from pandarallel import pandarallel

pandarallel.initialize()
# note: initialize() detects the total number of physical cores of the node and does not
# take the cores allocated by Slurm into account; the number of workers can be set
# explicitly to match --cpus-per-task, e.g. pandarallel.initialize(nb_workers=4)

N_rows = 10**5
N_cols = 5
df = pd.DataFrame(np.random.randn(N_rows, N_cols))

# some function that should be executed in parallel
def transform(x):
    return np.mean(x)

print('calculate with normal apply...')
df.apply(func=transform, axis=1)

print('calculate with pandarallel...')
df.parallel_apply(func=transform, axis=1)
```

For more examples of using pandarallel, check out https://github.com/nalepae/pandarallel/blob/master/docs/examples.ipynb.

Dask

Dask is an open-source, flexible library for parallel computing in Python.

Dask natively scales Python. It provides advanced parallelism and enables performance at scale for many popular tools. For instance: Dask arrays scale NumPy workflows, Dask dataframes scale Pandas workflows, and Dask-ML scales machine learning programming interfaces like Scikit-Learn and XGBoost.

Dask is composed of two parts:

  • Dynamic task scheduling optimized for computation and interactive computational workloads.
  • Big Data collections like parallel arrays, data frames, and lists that extend common interfaces like NumPy, Pandas, or Python iterators to larger-than-memory or distributed environments. These parallel collections run on top of dynamic task schedulers.

Dask supports several user interfaces (a short example of some of them follows the lists below):

High-Level:

  • Arrays: Parallel NumPy
  • Bags: Parallel lists
  • DataFrames: Parallel Pandas
  • Machine Learning: Parallel Scikit-Learn
  • Others from external projects, like XArray

Low-Level:

  • Delayed: Parallel function evaluation
  • Futures: Real-time parallel function evaluation
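
The following minimal sketch illustrates how the high-level collections and the low-level delayed interface look in practice. The array and dataframe sizes are only illustrative, and the sketch assumes nothing beyond a working Dask installation (see the installation section below).

```python
import dask
import dask.array as da
import dask.dataframe as dd
import numpy as np
import pandas as pd

# Dask array: chunked, parallel NumPy
x = da.random.random((10000, 10000), chunks=(1000, 1000))
print(x.mean().compute())

# Dask dataframe: partitioned, parallel Pandas
pdf = pd.DataFrame({"a": np.random.randn(10**6), "b": np.random.randint(0, 10, 10**6)})
ddf = dd.from_pandas(pdf, npartitions=8)
print(ddf.groupby("b").a.mean().compute())

# Dask delayed: build a task graph from ordinary Python functions
@dask.delayed
def square(n):
    return n * n

results = dask.compute(*[square(i) for i in range(10)])
print(results)
```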

Installation

Dask is available as a module on the ZIH systems for all software environments (scs5, ml, hiera(alpha)). To load the module for the corresponding software environment: ml dask

Please check the modules availability.

The module spider <name_of_the_module> command will show you all available modules with this name for all software partitions. For detailed information about a specific Dask package (including how to load the modules), use the module's full name, e.g.:

marie@compute$ module spider dask
------------------------------------------------------------------------------------------
    dask:
----------------------------------------------------------------------------------------------
    Versions:
        dask/2.8.0-fosscuda-2019b-Python-3.7.4
        dask/2.8.0-Python-3.7.4
        dask/2.8.0 (E)
[...]
marie@compute$ module load dask/2.8.0-fosscuda-2019b-Python-3.7.4
marie@compute$ python -c "import dask; print(dask.__version__)"
2021.08.1

The exact packages included in a module can be checked with the module whatis <name_of_the_module> command. The module whatis command displays short information about the module and its included extensions.

Installation Using Conda

Moreover, it is possible to install and use Dask in your local conda environment.

Dask is installed by default in Anaconda. To install/update Dask on the system using conda, follow this example:

marie@login$ srun -p ml -N 1 -n 1 --mem-per-cpu=5772 --gres=gpu:1 --time=04:00:00 --pty bash

Create a conda virtual environment. We recommend using a Workspace. See the example below (use the --prefix flag to specify the directory).

!!! note You could work with simple examples in your home directory (where you land by default after login). However, in accordance with the HPC storage concept, please use a Workspace for your study and work projects.

marie@compute$ conda create --prefix /scratch/ws/0/marie-Workproject/conda-virtual-environment/dask-test python=3.6

By default, conda will locate the environment in your home directory:

marie@compute$ conda create -n dask-test python=3.6

Activate the virtual environment, install Dask and verify the installation:

marie@compute$ ml modenv/ml
marie@compute$ ml PythonAnaconda/3.6
marie@compute$ conda activate /scratch/ws/0/marie-Workproject/conda-virtual-environment/dask-test
marie@compute$ which python
marie@compute$ which conda
marie@compute$ conda install dask
marie@compute$ python                            #start python
from dask.distributed import Client, progress
client = Client(n_workers=4, threads_per_worker=1)
client

Installation Using Pip

You can install everything required for most common uses of Dask (arrays, dataframes, etc.) with pip:

marie@login$ srun -p ml -N 1 -n 1 --mem-per-cpu=5772 --gres=gpu:1 --time=04:00:00 --pty bash

marie@compute$ cd /scratch/ws/0/marie-Workproject/python-virtual-environment/dask-test

marie@compute$ ml modenv/ml
marie@compute$ module load PythonAnaconda/3.6
marie@compute$ which python

marie@compute$ python3 -m venv --system-site-packages dask-test
marie@compute$ source dask-test/bin/activate
marie@compute$ python -m pip install "dask[complete]"
marie@compute$ python                            #start python
from dask.distributed import Client, progress
client = Client(n_workers=4, threads_per_worker=1)
client

Scheduling by Dask

One of the main features of Dask is its large-scale collections (Dask Array, Dask Bag, etc.). All of them use task graphs. After Dask generates these task graphs, it needs to execute them on parallel hardware. This is the job of a task scheduler.

Dask has two families of task schedulers:

  • Single machine scheduler: This scheduler provides basic features on a local process or thread pool. It can only be used on a single machine and does not scale, so it is of limited interest in the context of HPC.

  • Distributed scheduler: This scheduler is more sophisticated and offers more features, but also requires a bit more effort to set up. It can run locally or distributed across a cluster. A small sketch contrasting the two scheduler families follows this list.
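
The sketch below assumes only a Dask installation; the array size is illustrative.

```python
# Minimal sketch: the same computation with the single machine scheduler and,
# commented out, with the distributed scheduler (which needs a running Client).
import dask.array as da

x = da.random.random((20000, 20000), chunks=(2000, 2000))

# single machine scheduler: runs in a local thread pool, no setup required
result_local = x.mean().compute(scheduler='threads')
print(result_local)

# distributed scheduler: requires a Client (see the next section); once a Client
# exists, .compute() uses the distributed scheduler by default
# from dask.distributed import Client
# client = Client(...)
# result_distributed = x.mean().compute()
```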

Distributed Scheduler

There are a variety of ways to set up the distributed scheduler. In many cases, the dask.distributed scheduler will be used. To use the dask.distributed scheduler you must set up a Client:

from dask.distributed import Client
client = Client(...)  # Connect to distributed cluster and override default
df.x.sum().compute()  # a computation on an existing Dask dataframe df now runs on the distributed system

The idea behind Dask is to scale Python and distribute computations among workers (multiple machines, jobs). There are different ways to do that (for a single machine or for a distributed cluster). This page focuses mainly on Dask for HPC.

The preferred and simplest way to run Dask on ZIH systems today, both for new and experienced users, is to use dask-jobqueue.

However, Dask-jobqueue is slightly oriented toward interactive analysis, and it might be better to use tools like Dask-mpi for some routine batch production workloads.

Dask-mpi

You can launch a Dask network using mpirun or mpiexec and the dask-mpi command line executable. This depends on the mpi4py library. For more detailed information, please check the official documentation.
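
A minimal sketch of this approach is given below; the script name mpi_dask_test.py is hypothetical, the array size is illustrative, and the sketch assumes that dask-mpi and mpi4py are installed in the active environment.

```python
# mpi_dask_test.py (hypothetical file name)
# launch e.g. with: mpirun -np 4 python mpi_dask_test.py
from dask_mpi import initialize
from dask.distributed import Client
import dask.array as da

# rank 0 becomes the scheduler, rank 1 runs this client code,
# all remaining ranks become workers
initialize()
client = Client()  # connect to the scheduler started by initialize()

x = da.random.random((10000, 10000), chunks=(1000, 1000))
print(x.mean().compute())
```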

Dask-jobqueue

As mentioned above, the preferred and simplest way to run Dask on HPC is to use Dask-jobqueue. It allows an easy deployment of Dask Distributed on HPC systems with Slurm or other job queuing systems.

Installation of Dask-jobqueue

Dask-jobqueue is available as an extension of the Dask module (which can be loaded by: module load dask).

The availability of the exact packages (such as Dask-jobqueue) in the module can be checked with the module whatis <name_of_the_module> command.

Moreover, it is possible to install and use dask-jobqueue in your local environment. You can install Dask-jobqueue with pip or conda.

Installation with Pip
marie@login$ srun -p haswell -N 1 -n 1 -c 4 --mem-per-cpu=2583 --time=01:00:00 --pty bash
marie@compute$ cd /scratch/ws/0/marie-Workproject/python-virtual-environment/dask-test
marie@compute$ ml modenv/ml 
marie@compute$ module load PythonAnaconda/3.6 
marie@compute$ which python

marie@compute$ source dask-test/bin/activate              #Activate virtual environment
marie@compute$ pip install dask-jobqueue --upgrade        #Install everything from the latest released version

Installation with Conda
marie@login$ srun -p haswell -N 1 -n 1 -c 4 --mem-per-cpu=2583 --time=01:00:00 --pty bash

marie@compute$ ml modenv/ml 
marie@compute$ module load PythonAnaconda/3.6 
marie@compute$ source dask-test/bin/activate

marie@compute$ conda install dask-jobqueue -c conda-forge

Example of Using Dask-jobqueue with SLURMCluster

Dask-jobqueue allows running jobs on the ZIH system from within the Python code and scaling computations over these jobs. Dask-jobqueue creates a Dask Scheduler in the Python process where the cluster object is instantiated. Please check the following example of a definition of the cluster object for the alpha partition (queue in Dask terms) on the ZIH system:

from dask_jobqueue import SLURMCluster

cluster = SLURMCluster(queue='alpha', 
  cores=8,
  processes=2, 
  project='p_marie', 
  memory="8GB", 
  walltime="00:30:00")

The parameters above specify the characteristics of a single job or a single compute node, rather than the characteristics of your computation as a whole. The code has not actually launched any jobs yet. For the full computation, you will then ask for a number of jobs using the scale command, e.g. cluster.scale(2). Thus, you have to define a SLURMCluster with dask-jobqueue, scale it, and use it for your computations. Here is an example:

from distributed import Client
from dask_jobqueue import SLURMCluster
from dask import delayed

cluster = SLURMCluster(queue='alpha', 
  cores=8,
  processes=2, 
  project='p_scads', 
  memory="80GB", 
  walltime="00:30:00",
  extra=['--resources gpu=1'])

cluster.scale(2)             # scale it to 2 workers
client = Client(cluster)     # the Client object shows the number of workers (Python objects correspond to jobs)

Please have a look at the extra parameter in the script above. It can be used to specify special hardware resources that the scheduler is not aware of, for example, GPUs. Please don't forget to specify the name of your project.

The Python code for setting up and scaling Slurm clusters can be run by srun (but remember that using srun directly on the shell blocks the shell and launches an interactive job), by batch jobs, or via JupyterHub. Note: The job that runs the original code (de facto an interface) with the setup should be simple and lightweight. Please don't use a lot of resources for that.

Below you can find an example of using Dask via dask-jobqueue with SLURMCluster and dask.array for the Monte-Carlo estimation of Pi.
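
The following is a minimal sketch of such a computation; the cluster parameters mirror the SLURMCluster example above, the project name p_marie is a placeholder, and the sample size is illustrative.

```python
import dask.array as da
from distributed import Client
from dask_jobqueue import SLURMCluster

# cluster definition analogous to the example above (project name is a placeholder)
cluster = SLURMCluster(queue='alpha',
    cores=8,
    processes=2,
    project='p_marie',
    memory="8GB",
    walltime="00:30:00")
cluster.scale(2)
client = Client(cluster)

# Monte-Carlo estimation of Pi: the fraction of uniformly drawn points in the unit
# square that fall inside the quarter circle approximates pi/4
n = 10**9
xy = da.random.uniform(0, 1, size=(n, 2), chunks=10**7)
inside = ((xy ** 2).sum(axis=1) < 1.0).sum()
pi_estimate = 4 * inside.compute() / n
print(pi_estimate)
```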

mpi4py - MPI for Python

The Message Passing Interface (MPI) is a standardized and portable message-passing standard, designed to function on a wide variety of parallel computing architectures. It is a library specification that allows HPC applications to pass information between the nodes of a cluster. MPI is designed to provide access to advanced parallel hardware for end users, library writers and tool developers.

mpi4py (MPI for Python) provides bindings of the MPI standard for the Python programming language, allowing any Python program to exploit multiple processors.

mpi4py is based on the MPI-2 C++ bindings. It supports almost all MPI calls. This implementation is popular on Linux clusters and in the SciPy community. Operations are primarily methods of communicator objects. mpi4py supports the communication of pickle-able Python objects and provides optimized communication of NumPy arrays.
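
The following minimal sketch illustrates both communication styles; the file name send_recv.py is hypothetical, and the sketch assumes mpi4py is available. It would be launched with two processes, e.g. srun -n 2 python send_recv.py.

```python
# send_recv.py (hypothetical file name) -- run with two MPI processes
from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

# communication of generic (pickle-able) Python objects: lowercase send/recv
if rank == 0:
    comm.send({'step': 1, 'status': 'ok'}, dest=1, tag=11)
elif rank == 1:
    data = comm.recv(source=0, tag=11)
    print("received object:", data)

# optimized communication of NumPy arrays via the buffer interface: uppercase Send/Recv
buf = np.zeros(5, dtype='d')
if rank == 0:
    comm.Send([np.arange(5, dtype='d'), MPI.DOUBLE], dest=1, tag=22)
elif rank == 1:
    comm.Recv([buf, MPI.DOUBLE], source=0, tag=22)
    print("received array:", buf)
```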

mpi4py is included in the SciPy-bundle modules on the ZIH system.

marie@compute$ module load SciPy-bundle/2020.11-foss-2020b
Module SciPy-bundle/2020.11-foss-2020b and 28 dependencies loaded.
marie@compute$ pip list
Package                       Version
----------------------------- ----------
[...]
mpi4py                        3.0.3
[...]

Other versions of the package can be found with:

marie@compute$ module spider mpi4py
-----------------------------------------------------------------------------------------------------------------------------------------
  mpi4py:
-----------------------------------------------------------------------------------------------------------------------------------------
     Versions:
        mpi4py/1.3.1
        mpi4py/2.0.0-impi
        mpi4py/3.0.0 (E)
        mpi4py/3.0.2 (E)
        mpi4py/3.0.3 (E)

Names marked by a trailing (E) are extensions provided by another module.

-----------------------------------------------------------------------------------------------------------------------------------------
  For detailed information about a specific "mpi4py" package (including how to load the modules) use the module's full name.
  Note that names that have a trailing (E) are extensions provided by other modules.
  For example:

     $ module spider mpi4py/3.0.3
-----------------------------------------------------------------------------------------------------------------------------------------

Moreover, it is possible to install mpi4py in your local conda environment:

marie@login$ srun -p ml --time=04:00:00 -n 1 --pty --mem-per-cpu=8000 bash                            #allocate resources
marie@compute$ module load modenv/ml
marie@compute$ module load PythonAnaconda/3.6                                                         #load module to use conda
marie@compute$ conda create --prefix=<location_for_your_environment> python=3.6 anaconda              #create conda virtual environment

marie@compute$ conda activate <location_for_your_environment>                                       #activate your virtual environment

marie@compute$ conda install -c conda-forge mpi4py                                                  #install mpi4py

marie@compute$ python                                                                               #start python

Check if mpi4py is running correctly:

from mpi4py import MPI
comm = MPI.COMM_WORLD
print("%d of %d" % (comm.Get_rank(), comm.Get_size()))

For the multi-node case, use a script similar to this:

#!/bin/bash
#SBATCH --nodes=2
#SBATCH -p ml
#SBATCH --tasks-per-node=2
#SBATCH --cpus-per-task=1

module load modenv/ml
module load PythonAnaconda/3.6

eval "$(conda shell.bash hook)"
conda activate /home/marie/conda-virtual-environment/kernel2 && srun python mpi4py_test.py    #specify name of your virtual environment

For the verification of the multi-node case, you can use the Python code from the previous part (the installation verification) as a test file.