diff --git a/doc.zih.tu-dresden.de/docs/software/big_data_frameworks.md b/doc.zih.tu-dresden.de/docs/software/big_data_frameworks.md
index 3375f599f68e04344f4497fcf1fe3f43f45fbbd4..df7fc8b56a8a015b5a13a8c871b5163b2c1d473d 100644
--- a/doc.zih.tu-dresden.de/docs/software/big_data_frameworks.md
+++ b/doc.zih.tu-dresden.de/docs/software/big_data_frameworks.md
@@ -32,7 +32,6 @@ The steps are:
 
 Apache Spark can be used in [interactive](#interactive-jobs) and [batch](#batch-jobs) jobs as well
 as via [Jupyter notebooks](#jupyter-notebook). All three ways are outlined in the following.
-The usage of Flink with Jupyter notebooks is currently under examination.
 
 ## Interactive Jobs
 
@@ -238,50 +237,34 @@ example below:
 
 ## Jupyter Notebook
 
-You can run Jupyter notebooks with Spark on the ZIH systems in a similar way as described on the
-[JupyterHub](../access/jupyterhub.md) page. Interaction of Flink with JupyterHub is currently
-under examination and will be posted here upon availability.
+You can run Jupyter notebooks with Spark and Flink on the ZIH systems in a similar way as described
+on the [JupyterHub](../access/jupyterhub.md) page.
 
-### Preparation
-
-If you want to run Spark in Jupyter notebooks, you have to prepare it first. This is comparable
-to [normal Python virtual environments](../software/python_virtual_environments.md#python-virtual-environment).
-You start with an allocation:
-
-```console
-marie@login$ srun --pty --ntasks=1 --cpus-per-task=2 --mem-per-cpu=2500 --time=01:00:00 bash -l
-```
+### Spawning a Notebook
 
-When a node is allocated, install the required packages:
+Go to [https://taurus.hrsk.tu-dresden.de/jupyter](https://taurus.hrsk.tu-dresden.de/jupyter).
+In the tab "Advanced", go to the field "Preload modules" and select the following Spark or Flink
+module:
 
-```console
-marie@compute$ cd $HOME
-marie@compute$ mkdir jupyter-kernel
-marie@compute$ module load Python
-marie@compute$ virtualenv --system-site-packages jupyter-kernel/env #Create virtual environment
-[...]
-marie@compute$ source jupyter-kernel/env/bin/activate #Activate virtual environment.
-(env) marie@compute$ pip install ipykernel
-[...]
-(env) marie@compute$ python -m ipykernel install --user --name haswell-py3.7-spark --display-name="haswell-py3.7-spark"
-Installed kernelspec haswell-py3.7-spark in [...]
-
-(env) marie@compute$ pip install findspark
-(env) marie@compute$ deactivate
-```
+=== "Spark"
+    ```
+    Spark/3.0.1-Hadoop-2.7-Java-1.8-Python-3.7.4-GCCcore-8.3.0
+    ```
+=== "Flink"
+    ```
+    Flink/1.12.3-Java-1.8.0_161-OpenJDK-Python-3.7.4-GCCcore-8.3.0
+    ```
 
-You are now ready to spawn a notebook with Spark.
+When your Jupyter instance is started, you can set up Spark/Flink. Since the setup in the notebook
+requires more steps than in an interactive session, we have created example notebooks that you can
+use as a starting point for convenience: [SparkExample.ipynb](misc/SparkExample.ipynb),
+[FlinkExample.ipynb](misc/FlinkExample.ipynb)
 
-### Spawning a Notebook
+!!! warning
 
-Assuming that you have prepared everything as described above, you can go to
-[https://taurus.hrsk.tu-dresden.de/jupyter](https://taurus.hrsk.tu-dresden.de/jupyter).
-In the tab "Advanced", go to the field "Preload modules" and select one of the Spark modules. When
-your Jupyter instance is started, check whether the kernel that you created in the preparation
-phase (see above) is shown in the top right corner of the notebook. If it is not already selected,
-select the kernel `haswell-py3.7-spark`. Then, you can set up Spark. Since the setup in the
-notebook requires more steps than in an interactive session, we have created an example notebook
-that you can use as a starting point for convenience: [SparkExample.ipynb](misc/SparkExample.ipynb)
+    The notebooks only work with the Spark or Flink module mentioned above. When using other
+    Spark/Flink modules, you may have to perform additional or different steps to get Spark/Flink
+    running.
 
 !!! note
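A note on the two notebook diffs that follow: if the wrong module (or none at all) is preloaded, the example cells fail with unset environment variables. A first cell along the following lines can make that failure obvious. This is a sketch, not part of the committed notebooks; it assumes only that the modules export `SPARK_HOME`, `FLINK_ROOT_DIR`, and `JAVA_HOME`, which the cells below rely on.

```python
# Hypothetical sanity-check cell (not in the committed notebooks): confirm
# that the preloaded module exported the variables the examples depend on.
import os

for var in ("SPARK_HOME", "FLINK_ROOT_DIR", "JAVA_HOME"):
    # SPARK_HOME is used by SparkExample.ipynb, FLINK_ROOT_DIR by
    # FlinkExample.ipynb; JAVA_HOME is needed by both frameworks.
    print(var, "=", os.environ.get(var, "<not set - check 'Preload modules'>"))
```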
diff --git a/doc.zih.tu-dresden.de/docs/software/misc/FlinkExample.ipynb b/doc.zih.tu-dresden.de/docs/software/misc/FlinkExample.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..5a867b8750704ea92a318087d82bb0ca3355018d
--- /dev/null
+++ b/doc.zih.tu-dresden.de/docs/software/misc/FlinkExample.ipynb
@@ -0,0 +1,159 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "!{sys.executable} -m pip install apache-flink --user"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "echo $FLINK_ROOT_DIR\n",
+    "echo $JAVA_HOME\n",
+    "hostname\n",
+    "if [ ! -d $HOME/jupyter-flink-conf ]\n",
+    "then\n",
+    "cp -r $FLINK_ROOT_DIR/conf $HOME/jupyter-flink-conf\n",
+    "chmod -R u+w $HOME/jupyter-flink-conf\n",
+    "fi"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import os\n",
+    "os.environ['FLINK_CONF_DIR'] = os.environ['HOME'] + '/cluster-conf-' + os.environ['SLURM_JOBID'] + '/flink'\n",
+    "os.environ['PYTHONPATH'] = os.environ['PYTHONPATH'] + ':' + os.environ['HOME'] + '/.local/lib/python3.6/site-packages'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!SHELL=/bin/bash bash framework-configure.sh flink $HOME/jupyter-flink-conf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "exitcode = os.system('start-cluster.sh')\n",
+    "if not exitcode:\n",
+    "    print(\"Started Flink cluster successfully\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "echo \"This is a short story for you. In this story nothing is happening. Have a nice day!\" > myFlinkTestFile"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyflink.datastream import StreamExecutionEnvironment\n",
+    "from pyflink.datastream.connectors import FileSource\n",
+    "from pyflink.datastream.connectors import StreamFormat\n",
+    "from pyflink.common.watermark_strategy import WatermarkStrategy\n",
+    "from pyflink.common.typeinfo import Types\n",
+    "\n",
+    "env = StreamExecutionEnvironment.get_execution_environment()\n",
+    "env.set_parallelism(2)\n",
+    "# set the Python executable for the workers\n",
+    "env.set_python_executable(sys.executable)\n",
+    "# define the source\n",
+    "ds = env.from_source(source=FileSource.for_record_stream_format(StreamFormat.text_line_format(),\n",
+    "                                                                \"myFlinkTestFile\").process_static_file_set().build(),\n",
+    "                     watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),\n",
+    "                     source_name=\"file_source\")\n",
+    "\n",
+    "def split(line):\n",
+    "    yield from line.split()\n",
+    "\n",
+    "\n",
+    "# compute word count\n",
+    "ds = ds.flat_map(split) \\\n",
+    "       .map(lambda i: (i, 1), output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \\\n",
+    "       .key_by(lambda i: i[0]) \\\n",
+    "       .reduce(lambda i, j: (i[0], i[1] + j[1])) \\\n",
+    "       .map(lambda i: print(i))\n",
+    "\n",
+    "# submit for execution\n",
+    "env.execute()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "stop-cluster.sh"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!ps -ef | grep -i java"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pkill -f \"java\""
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
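The notebook above drives the cluster lifecycle with `os.system('start-cluster.sh')` and a bare `stop-cluster.sh` call. As a sketch, assuming both scripts are on the `PATH` provided by the Flink module, the same lifecycle can be run with explicit error handling so that a failed start does not go unnoticed:

```python
# Sketch: cluster lifecycle with explicit error handling instead of os.system().
# Assumes start-cluster.sh and stop-cluster.sh come from the loaded Flink module.
import subprocess

def run_script(script):
    # check=True raises CalledProcessError if the script exits non-zero,
    # so a failed cluster start cannot slip by silently in the notebook.
    # (universal_newlines rather than text=..., for the Python 3.6 kernel)
    result = subprocess.run([script], check=True, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT, universal_newlines=True)
    print(result.stdout)

run_script("start-cluster.sh")  # bring up the standalone Flink cluster
try:
    pass  # submit PyFlink jobs here, e.g. the word count above
finally:
    run_script("stop-cluster.sh")  # always tear the cluster down
```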
diff --git a/doc.zih.tu-dresden.de/docs/software/misc/SparkExample.ipynb b/doc.zih.tu-dresden.de/docs/software/misc/SparkExample.ipynb
index 67eb37e898667946a0a6dbdf60bc104908e12601..959b536b85dd3d5d01c79217b697506a7517d4f3 100644
--- a/doc.zih.tu-dresden.de/docs/software/misc/SparkExample.ipynb
+++ b/doc.zih.tu-dresden.de/docs/software/misc/SparkExample.ipynb
@@ -1,5 +1,24 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "!{sys.executable} -m pip install findspark --user"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!which python"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -27,7 +46,8 @@
     "import sys\n",
     "import os\n",
     "os.environ['PYSPARK_PYTHON'] = sys.executable\n",
-    "os.environ['SPARK_CONF_DIR'] = os.environ['HOME'] + '/cluster-conf-' + os.environ['SLURM_JOBID'] + '/spark'"
+    "os.environ['SPARK_CONF_DIR'] = os.environ['HOME'] + '/cluster-conf-' + os.environ['SLURM_JOBID'] + '/spark'\n",
+    "os.environ['PYTHONPATH'] = os.environ['PYTHONPATH'] + ':' + os.environ['HOME'] + '/.local/lib/python3.6/site-packages'"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -48,6 +68,16 @@
     "!start-all.sh"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
"outputs": [], + "source": [ + "import findspark\n", + "findspark.init(os.environ['SPARK_HOME'])" + ] + }, { "cell_type": "code", "execution_count": null, @@ -116,20 +146,13 @@ "source": [ "!pkill -f \"pyspark-shell\"" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "haswell-py3.7-spark", + "display_name": "Python 3", "language": "python", - "name": "haswell-py3.7-spark" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -141,7 +164,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.6.10" } }, "nbformat": 4, diff --git a/doc.zih.tu-dresden.de/wordlist.aspell b/doc.zih.tu-dresden.de/wordlist.aspell index ef6dc3d94888a5c6fe6ef99394d3e03d63e85df6..262f5eeae1b153648d59418137b8bac2dc2cf5fb 100644 --- a/doc.zih.tu-dresden.de/wordlist.aspell +++ b/doc.zih.tu-dresden.de/wordlist.aspell @@ -87,6 +87,7 @@ filesystem filesystems flink Flink +FlinkExample FMA foreach Fortran @@ -145,6 +146,7 @@ inode Instrumenter IOPS IPs +ipynb ISA Itanium jobqueue @@ -322,6 +324,7 @@ Slurm SLURMCluster SMP SMT +SparkExample spython squeue srun