# <details><summary style="cursor: pointer;"><strong>Unfamiliar with HLL?</strong></summary>
# <ul>
# <li>A basic introduction to working with HLL data is provided <a href="https://kartographie.geo.tu-dresden.de/python_datascience_course/02_hll_intro.html">in this jupyter notebook</a></li>
# <li>The scoping study for the sunset-sunrise paper was based on the Flickr YFCC 100M dataset, and the corresponding notebooks can be found <a href="https://gitlab.vgiscience.de/ad/yfcc_gridagg">here</a></li>
# <li>The notebooks for the sunset-sunrise article can be found <a href="https://gitlab.vgiscience.de/ad/sunset-sunrise-paper">here</a></li>
# </ul>
# </details>
# First, let's test the slower Python implementation [python-hll](https://github.com/AdRoll/python-hll) to get the cardinality for some of the original HLL strings. Cardinality means the estimated count of distinct items in a set, which in our case is the user count.
# + tags=[]
from python_hll.util import NumberUtil
from python_hll.hll import HLL
# + tags=[]
def hll_from_byte(hll_set: str):
    """Return HLL set from binary representation"""
    hex_string = hll_set[2:]
    return HLL.from_bytes(
        NumberUtil.from_hex(
            hex_string, 0, len(hex_string)))
# -
# Have a look at the first HLL set:
# + tags=[]
df["user_hll"][0]
# -
# Cast to HLL and calculate cardinality in one step:
# + tags=[]
hll_from_byte(df["user_hll"][0]).cardinality() - 1
# -
# An estimated 12 users have been observed at location `0.040050,-179.750586` (lat, lng).
#
# These latitude and longitude coordinates refer to the centroids of the original 50 km grid.
# ## Test Postgres HLL
# To speed up processing, we can connect to a Postgres database running the faster [postgresql-hll](https://github.com/citusdata/postgresql-hll) extension from Citus.
#
# Password and username for connecting to the local [hllworker](https://gitlab.vgiscience.de/lbsn/databases/pg-hll-empty) database are loaded from the environment.
# + tags=[]
# set connection variables
DB_USER = "hlluser"
DB_PASS = os.getenv('READONLY_USER_PASSWORD')
DB_HOST = "hllworkerdb"
DB_PORT = "5432"
DB_NAME = "hllworkerdb"
# -
# Connect to empty Postgres database running HLL Extension:
# + tags=[]
DB_CONN = psycopg2.connect(
    host=DB_HOST,
    port=DB_PORT,
    dbname=DB_NAME,
    user=DB_USER,
    password=DB_PASS
)
DB_CONN.set_session(
    readonly=True)
DB_CALC = tools.DbConn(
    DB_CONN)
CUR_HLL = DB_CONN.cursor()
# -
# Test connection:
# + tags=[]
CUR_HLL.execute("SELECT 1;")
print(CUR_HLL.statusmessage)
# -
# Test with an actual HLL set calculation:
# + tags=[]
db_query = f"""
SELECT lat, lng, hll_cardinality(user_hll)::int as usercount
# Next, we will:
#
# 1. load the full list of HLL sets for all coordinates
# 2. assign coordinates to continents
# 3. union all HLL sets per continent, to retrieve the combined number of estimated distinct users having visited each continent on Flickr
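#
# Note that the cardinality query started above is truncated in this excerpt. As a minimal sketch (an assumption, not the original code), the HLL strings can be passed to the otherwise empty hll worker as literal values; the `latitude`/`longitude` column names in `df` are hypothetical:
# + tags=[]
# Sketch: build a VALUES list from a few sample rows and let Postgres
# compute the cardinalities (column names in df are assumed)
values = ",\n".join(
    f"({lat}, {lng}, '{hll}'::hll)"
    for lat, lng, hll in zip(
        df["latitude"].head(5), df["longitude"].head(5), df["user_hll"].head(5)))
db_query = f"""
    SELECT lat, lng, hll_cardinality(user_hll)::int AS usercount
    FROM (VALUES {values}) AS t(lat, lng, user_hll);
    """
CUR_HLL.execute(db_query)
CUR_HLL.fetchall()
# -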
# ### Load continent geometries
# + tags=[]
world = gp.read_file(
    gp.datasets.get_path('naturalearth_lowres'),
    crs=CRS_WGS)
world = world.to_crs(CRS_PROJ)
# + tags=[]
world = world[['continent', 'geometry']]
# + tags=[]
ax = world.plot(column='continent')
ax.set_axis_off()
# -
# There are slight inaccuracies in the continent geometries (overlapping polygons), which can be fixed with a small buffer before dissolving the country geometries:
# + tags=[]
world['geometry'] = world.buffer(0.01)
# + tags=[]
continents = world.dissolve(by='continent')
# + tags=[]
continents.head()
# -
# Assign the index back as a column, to allow classification by color via `column='continent'`:
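#
# A minimal sketch of that step (the original cell is not included in this excerpt); after `dissolve`, the continent name only lives in the index:
# + tags=[]
continents['continent'] = continents.index
ax = continents.plot(column='continent')
ax.set_axis_off()
# -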
# It is also possible to intersect HLL sets, to some degree. This can be used to estimate common visitor counts for (e.g.) countries. The average error, however, will be larger, especially for intersections of HLL sets of very different sizes.
# Intersect the HLL coordinates with the country geometries. Since the HLL coordinates are pre-aggregated (50 km), direct intersection will introduce some error (an instance of the [MAUP](https://en.wikipedia.org/wiki/Modifiable_areal_unit_problem)).
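#
# The code that intersects the coordinates with the country geometries and unions the HLL sets per country is not included in this excerpt. Below is a minimal sketch of how `name_ref`, `hll_series` and `cardinality_hll` (used further down) could be derived; the `latitude`/`longitude` column names, the GeoDataFrame construction and the helper implementations are assumptions, with the unions again computed on the hll worker via `hll_union_agg()`:
# + tags=[]
# Assign each pre-aggregated (50 km) coordinate to a country (sketch only)
gdf = gp.GeoDataFrame(
    df, crs=CRS_WGS,
    geometry=gp.points_from_xy(df["longitude"], df["latitude"]))
countries = gp.read_file(
    gp.datasets.get_path('naturalearth_lowres'))[["iso_a3", "geometry"]]
gdf = gp.sjoin(gdf, countries)

# The three countries discussed below (ISO codes are assumptions)
name_ref = {"USA": "United States", "DEU": "Germany", "CAN": "Canada"}

def union_hll(hll_sets) -> str:
    """Union HLL sets on the hll worker, return the combined set."""
    values = ",".join(f"('{hll}'::hll)" for hll in hll_sets)
    CUR_HLL.execute(
        f"SELECT hll_union_agg(s.hll_set)::text FROM (VALUES {values}) s(hll_set);")
    return CUR_HLL.fetchone()[0]

def cardinality_hll(hll_set) -> int:
    """Return the estimated cardinality of a single HLL set."""
    CUR_HLL.execute(f"SELECT hll_cardinality('{hll_set}'::hll)::int;")
    return CUR_HLL.fetchone()[0]

hll_series = {
    iso: union_hll(gdf.loc[gdf["iso_a3"] == iso, "user_hll"])
    for iso in name_ref}
# -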
# Return the distinct user counts with their error margin (±2.3%):
# + tags=[]
for ref, hll_set in hll_series.items():
    cardinality = cardinality_hll(hll_set)
    print(f'Estimated {cardinality} (±{int(cardinality / 100 * 2.3)}) users for {name_ref.get(ref)}')
f"who shared Flickr photos from {name_ref.get(a)} and {name_ref.get(b)} (intersection)")
# -
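# The ±2.3% margin used above corresponds to the expected relative error of HyperLogLog, $\pm 1.04/\sqrt{2^{log2m}}$; with the postgresql-hll default of `log2m = 11` registers this is $1.04/\sqrt{2048} \approx 2.3\%$ (assuming the HLL sets were created with that default).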
# Finally, let's get the number of users who have shared pictures from all three countries, based on the [formula for three sets](https://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle):
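#
# Because HLL only supports unions directly, the triple intersection is derived from union cardinalities by rearranging the inclusion-exclusion formula:
#
# $|A \cap B \cap C| = |A| + |B| + |C| - |A \cup B| - |A \cup C| - |B \cup C| + |A \cup B \cup C|$
#
# A minimal sketch, building on the hedged `union_hll()`/`cardinality_hll()` helpers above (the `hll_series` keys are assumptions):
# + tags=[]
# hll_series keys ("USA", "DEU", "CAN") are assumptions from the sketch above
hll_a, hll_b, hll_c = (
    hll_series["USA"], hll_series["DEU"], hll_series["CAN"])
card = cardinality_hll
usercount_abc = (
    card(hll_a) + card(hll_b) + card(hll_c)
    - card(union_hll([hll_a, hll_b]))
    - card(union_hll([hll_a, hll_c]))
    - card(union_hll([hll_b, hll_c]))
    + card(union_hll([hll_a, hll_b, hll_c])))
usercount_abc
# -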
# Since negative visitor counts are impossible, the increasing inaccuracy of nested intersections is easy to observe here. In other words, there is simply too little overlap between the visitors of all three countries to be measurable with HLL intersections.
# ## Visualize as Venn diagram
# Since we're going to visualize this with [matplotlib-venn](https://github.com/konstantint/matplotlib-venn), we need the following variables:
# + tags=[]
from matplotlib_venn import venn3, venn3_circles

plt.figure(figsize=(3, 3))
v = venn3(
    subsets=(500, 500, 100, 500, 100, 100, 10),
    set_labels=('A', 'B', 'C'))
v.get_label_by_id('100').set_text('Abc')
v.get_label_by_id('010').set_text('aBc')
v.get_label_by_id('001').set_text('abC')
v.get_label_by_id('110').set_text('ABc')
v.get_label_by_id('101').set_text('AbC')
v.get_label_by_id('011').set_text('aBC')
v.get_label_by_id('111').set_text('ABC')
plt.show()
# -
# We already have `ABC`; the other values can be calculated:
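#
# A minimal sketch (building on the hedged helpers and names introduced above) of how the remaining exclusive region sizes for `venn3` could be derived; `venn3` expects them in the order `(Abc, aBc, ABc, abC, AbC, aBC, ABC)`:
# + tags=[]
# Total, pairwise and triple estimates (variable names are illustrative)
A, B, C = card(hll_a), card(hll_b), card(hll_c)
AB = A + B - card(union_hll([hll_a, hll_b]))
AC = A + C - card(union_hll([hll_a, hll_c]))
BC = B + C - card(union_hll([hll_b, hll_c]))
ABC = usercount_abc
# Exclusive (non-overlapping) region sizes
Abc = A - AB - AC + ABC
aBc = B - AB - BC + ABC
abC = C - AC - BC + ABC
ABc = AB - ABC
AbC = AC - ABC
aBC = BC - ABC
plt.figure(figsize=(3, 3))
venn3(
    subsets=(Abc, aBc, ABc, abC, AbC, aBC, ABC),
    set_labels=('US', 'DE', 'CA'))
plt.show()
# -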
# ## Conclusions
#
# Each dot on the left side is a coordinate shared in the original dataset, with an attached HLL set abstracting all users who shared photographs from this part of the world. By unioning all HLL sets for each of the three countries, based on the coordinate-country intersection, we were able to calculate the intersections between the different sets. These estimated common visitor counts are labeled on the right side of the Venn diagram.
#
# The number of distinct users who shared photos from the US by far outweighs Germany and Canada. Not surprisingly, there is a larger overlap of common visitors between Canada and the USA. By union of HLL sets, we are able to estimate distinct visitor counts for arbitrary areas or regions. This ability to re-use HLL sets is, however, limited by the 50 km lower resolution limit of the shared benchmark dataset.