From ae8f79444eef9a16fa7ade94339569a9eeabd47b Mon Sep 17 00:00:00 2001
From: Elias Werner <eliwerner3@googlemail.com>
Date: Tue, 14 Dec 2021 19:58:16 +0100
Subject: [PATCH] Remove deprecated Python virtual environment from Flink/Spark
 in Jupyter; add Flink Jupyter section; update Spark and Flink Jupyter notebooks

---
 .../software/big_data_frameworks_spark.md     |  45 ++---
 doc.zih.tu-dresden.de/docs/software/flink.md  |  33 ++++
 .../docs/software/misc/FlinkExample.ipynb     | 159 ++++++++++++++++++
 .../docs/software/misc/SparkExample.ipynb     |  45 +++--
 4 files changed, 238 insertions(+), 44 deletions(-)
 create mode 100644 doc.zih.tu-dresden.de/docs/software/misc/FlinkExample.ipynb

diff --git a/doc.zih.tu-dresden.de/docs/software/big_data_frameworks_spark.md b/doc.zih.tu-dresden.de/docs/software/big_data_frameworks_spark.md
index 84f5935a1..81a66719d 100644
--- a/doc.zih.tu-dresden.de/docs/software/big_data_frameworks_spark.md
+++ b/doc.zih.tu-dresden.de/docs/software/big_data_frameworks_spark.md
@@ -158,46 +158,25 @@ example below:
 You can run Jupyter notebooks with Spark on the ZIH systems in a similar way as described on the
 [JupyterHub](../access/jupyterhub.md) page.
 
-### Preparation
 
-If you want to run Spark in Jupyter notebooks, you have to prepare it first. This is comparable
-to [normal Python virtual environments](../software/python_virtual_environments.md#python-virtual-environment).
-You start with an allocation:
-
-```console
-marie@login$ srun --pty --ntasks=1 --cpus-per-task=2 --mem-per-cpu=2500 --time=01:00:00 bash -l
-```
+### Spawning a Notebook
 
-When a node is allocated, install the required packages:
+Go to [https://taurus.hrsk.tu-dresden.de/jupyter](https://taurus.hrsk.tu-dresden.de/jupyter).
+In the tab "Advanced", go to the field "Preload modules" and select the following Spark module:
 
-```console
-marie@compute$ cd $HOME
-marie@compute$ mkdir jupyter-kernel
-marie@compute$ module load Python
-marie@compute$ virtualenv --system-site-packages jupyter-kernel/env  #Create virtual environment
-[...]
-marie@compute$ source jupyter-kernel/env/bin/activate    #Activate virtual environment.
-(env) marie@compute$ pip install ipykernel
-[...]
-(env) marie@compute$ python -m ipykernel install --user --name haswell-py3.7-spark --display-name="haswell-py3.7-spark"
-Installed kernelspec haswell-py3.7-spark in [...]
-
-(env) marie@compute$ pip install findspark
-(env) marie@compute$ deactivate
+```
+Spark/3.0.1-Hadoop-2.7-Java-1.8-Python-3.7.4-GCCcore-8.3.0
 ```
 
-You are now ready to spawn a notebook with Spark.
+When your Jupyter instance is started, you can set up Spark. Since the setup in the notebook
+requires more steps than in an interactive session, we have created an example notebook that you can
+use as a starting point for convenience: [SparkExample.ipynb](misc/SparkExample.ipynb)
 
-### Spawning a Notebook
+!!! warning
+
+    This notebook only works with the Spark module mentioned above. When using other Spark modules,
+    it is possible that you have to perform additional or different steps to get Spark running.
 
-Assuming that you have prepared everything as described above, you can go to
-[https://taurus.hrsk.tu-dresden.de/jupyter](https://taurus.hrsk.tu-dresden.de/jupyter).
-In the tab "Advanced", go to the field "Preload modules" and select one of the Spark modules. When
-your Jupyter instance is started, check whether the kernel that you created in the preparation
-phase (see above) is shown in the top right corner of the notebook. If it is not already selected,
-select the kernel `haswell-py3.7-spark`. Then, you can set up Spark. Since the setup in the
-notebook requires more steps than in an interactive session, we have created an example notebook
-that you can use as a starting point for convenience: [SparkExample.ipynb](misc/SparkExample.ipynb)
 
 !!! note
 
diff --git a/doc.zih.tu-dresden.de/docs/software/flink.md b/doc.zih.tu-dresden.de/docs/software/flink.md
index 05fb403c2..b20d340fd 100644
--- a/doc.zih.tu-dresden.de/docs/software/flink.md
+++ b/doc.zih.tu-dresden.de/docs/software/flink.md
@@ -158,6 +158,39 @@ example below:
     [workspaces](../data_lifecycle/workspaces.md) for your study and work projects**. For this
     reason, you have to use advanced options of Jupyterhub and put "/" in "Workspace scope" field.
 
+
+## Jupyter Notebook
+
+You can run Jupyter notebooks with Flink on the ZIH systems in a similar way as described on the
+[JupyterHub](../access/jupyterhub.md) page.
+
+
+### Spawning a Notebook
+
+Go to [https://taurus.hrsk.tu-dresden.de/jupyter](https://taurus.hrsk.tu-dresden.de/jupyter).
+In the tab "Advanced", go to the field "Preload modules" and select the following Flink module:
+
+```
+Flink/1.12.3-Java-1.8.0_161-OpenJDK-Python-3.7.4-GCCcore-8.3.0
+```
+
+When your Jupyter instance is started, you can set up Flink. Since the setup in the notebook
+requires more steps than in an interactive session, we have created an example notebook that you can
+use as a starting point for convenience: [FlinkExample.ipynb](misc/FlinkExample.ipynb)
+
+!!! warning
+
+    This notebook only works with the Flink module mentioned above. When using other Flink modules,
+    it is possible that you have to perform additional or different steps to get Flink running.
+
+!!! note
+
+    You could work with simple examples in your home directory, but, according to the
+    [storage concept](../data_lifecycle/overview.md), **please use
+    [workspaces](../data_lifecycle/workspaces.md) for your study and work projects**. For this
+    reason, you have to use advanced options of Jupyterhub and put "/" in "Workspace scope" field.
+
+
 ## FAQ
 
 Q: Command `source framework-configure.sh hadoop
diff --git a/doc.zih.tu-dresden.de/docs/software/misc/FlinkExample.ipynb b/doc.zih.tu-dresden.de/docs/software/misc/FlinkExample.ipynb
new file mode 100644
index 000000000..5a867b875
--- /dev/null
+++ b/doc.zih.tu-dresden.de/docs/software/misc/FlinkExample.ipynb
@@ -0,0 +1,159 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "!{sys.executable} -m pip install apache-flink --user"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "echo $FLINK_ROOT_DIR\n",
+    "echo $JAVA_HOME\n",
+    "hostname\n",
+    "if [ ! -d $HOME/jupyter-flink-conf ]\n",
+    "then\n",
+    "cp -r $FLINK_ROOT_DIR/conf $HOME/jupyter-flink-conf\n",
+    "chmod -R u+w $HOME/jupyter-flink-conf\n",
+    "fi"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import os\n",
+    "os.environ['FLINK_CONF_DIR'] = os.environ['HOME'] + '/cluster-conf-' + os.environ['SLURM_JOBID'] + '/flink'\n",
+    "os.environ['PYTHONPATH'] = os.environ['PYTHONPATH'] + ':' + os.environ['HOME'] + '/.local/lib/python3.6/site-packages'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!SHELL=/bin/bash bash framework-configure.sh flink $HOME/jupyter-flink-conf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "exitcode = os.system('start-cluster.sh')\n",
+    "if not exitcode:\n",
+    "    print(\"started Flink cluster successful\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "echo \"This is a short story for you. In this story nothing is happening. Have a nice day!\" > myFlinkTestFile"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyflink.datastream import StreamExecutionEnvironment\n",
+    "from pyflink.datastream.connectors import FileSource\n",
+    "from pyflink.datastream.connectors import StreamFormat\n",
+    "from pyflink.common.watermark_strategy import WatermarkStrategy\n",
+    "from pyflink.common.typeinfo import Types\n",
+    "\n",
+    "env = StreamExecutionEnvironment.get_execution_environment()\n",
+    "env.set_parallelism(2)\n",
+    "#set the Python executable for the workers\n",
+    "env.set_python_executable(sys.executable)\n",
+    "# define the source\n",
+    "ds = env.from_source(source=FileSource.for_record_stream_format(StreamFormat.text_line_format(),\n",
+    "                                               \"myFlinkTestFile\").process_static_file_set().build(),\n",
+    "                     watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),\n",
+    "                     source_name=\"file_source\")\n",
+    "\n",
+    "def split(line):\n",
+    "    yield from line.split()\n",
+    "\n",
+    "    \n",
+    "# compute word count\n",
+    "ds = ds.flat_map(split) \\\n",
+    "    .map(lambda i: (i, 1), output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \\\n",
+    "    .key_by(lambda i: i[0]) \\\n",
+    "    .reduce(lambda i, j: (i[0], i[1] + j[1])) \\\n",
+    "    .map(lambda i: print(i))\n",
+    "\n",
+    "# submit for execution\n",
+    "env.execute()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "stop-cluster.sh"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!ps -ef | grep -i java"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pkill -f \"java\""
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/doc.zih.tu-dresden.de/docs/software/misc/SparkExample.ipynb b/doc.zih.tu-dresden.de/docs/software/misc/SparkExample.ipynb
index 67eb37e89..959b536b8 100644
--- a/doc.zih.tu-dresden.de/docs/software/misc/SparkExample.ipynb
+++ b/doc.zih.tu-dresden.de/docs/software/misc/SparkExample.ipynb
@@ -1,5 +1,24 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "!{sys.executable} -m pip install findspark --user"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!which python"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -27,7 +46,8 @@
     "import sys\n",
     "import os\n",
     "os.environ['PYSPARK_PYTHON'] = sys.executable\n",
-    "os.environ['SPARK_CONF_DIR'] = os.environ['HOME'] + '/cluster-conf-' + os.environ['SLURM_JOBID'] + '/spark'"
+    "os.environ['SPARK_CONF_DIR'] = os.environ['HOME'] + '/cluster-conf-' + os.environ['SLURM_JOBID'] + '/spark'\n",
+    "os.environ['PYTHONPATH'] = os.environ['PYTHONPATH'] + ':' + os.environ['HOME'] + '/.local/lib/python3.6/site-packages'"
    ]
   },
   {
@@ -48,6 +68,16 @@
     "!start-all.sh"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import findspark\n",
+    "findspark.init(os.environ['SPARK_HOME'])"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -116,20 +146,13 @@
    "source": [
     "!pkill -f \"pyspark-shell\""
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "haswell-py3.7-spark",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "haswell-py3.7-spark"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -141,7 +164,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.4"
+   "version": "3.6.10"
   }
  },
  "nbformat": 4,
-- 
GitLab