Unverified Commit 6a30f5ca authored by Alexander Dunkel

Update auto converted files to latest

parent 3a8139ab
Pipeline #90442 passed
@@ -20,7 +20,7 @@
# <div style="width:256px;text-align:right;margin-top:0px;margin-right:10px"><a href="https://gitlab.hrz.tu-chemnitz.de/tud-ifk/sunsetsunrise-example-benchmarkdata"><img src="https://kartographie.geo.tu-dresden.de/ad/sunsetsunrise-demo/version.svg"></a></div>
# </div>
# + tags=["hide_code"] jupyter={"source_hidden": true}
from IPython.display import Markdown as md
from datetime import date
@@ -29,7 +29,7 @@ with open('/.version', 'r') as file: app_version = file.read().split("'")[1]
md(f"Last updated: {today.strftime('%b-%d-%Y')}, [Carto-Lab Docker](https://gitlab.vgiscience.de/lbsn/tools/jupyterlab) Version {app_version}")
# -
# This notebook demonstrates how to load and use the HyperLogLog benchmark data shared in
#
# > Dunkel, A., Hartmann, M. C., Hauthal, E., Burghardt, Dirk, & Purves, R. S. (2023).
# From sunrise to sunset: Exploring landscape preference through global reactions to
@@ -38,10 +38,10 @@ md(f"Last updated: {today.strftime('%b-%d-%Y')}, [Carto-Lab Docker](https://gitlab.vgiscience.de/lbsn/tools/jupyterlab) Version {app_version}")
# The data is available in a [repository](https://doi.org/10.25532/OPARA-200)
#
# <div class="alert alert-success" role="alert" style="color: black;">
# <details><summary style="cursor: pointer;"><strong>Unfamiliar with HLL?</strong></summary>
# <ul>
# <li>A basic introduction to working with HLL data is provided <a href="https://kartographie.geo.tu-dresden.de/python_datascience_course/02_hll_intro.html">in this jupyter notebook</a></li>
# <li>The scoping study for the sunset-sunrise paper was based on the Flickr YFCC 100M dataset, and the corresponding notebooks can be found <a href="https://gitlab.vgiscience.de/ad/yfcc_gridagg">here</a></li>
# <li>The notebooks for the sunset-sunrise article can be found <a href="https://gitlab.vgiscience.de/ad/sunset-sunrise-paper">here</a></li>
# </ul>
# </details>
@@ -67,7 +67,9 @@ md(f"Last updated: {today.strftime('%b-%d-%Y')}, [Carto-Lab Docker](https://gitlab.vgiscience.de/lbsn/tools/jupyterlab) Version {app_version}")
# # cd jupyterlab
# # cp .env.example .env
# nano .env
# ## Enter:
# # JUPYTER_NOTEBOOKS=~/notebooks/sunsetsunrise-demo
# # TAG=v0.12.3
# docker network create lbsn-network
# docker-compose pull && docker-compose up -d
# ```
@@ -89,7 +91,7 @@ md(f"Last updated: {today.strftime('%b-%d-%Y')}, [Carto-Lab Docker](https://gitlab.vgiscience.de/lbsn/tools/jupyterlab) Version {app_version}")
# </details>
# </div>
# + tags=["hide_code"] # + tags=["hide_code"] jupyter={"source_hidden": true}
import sys import sys
from pathlib import Path from pathlib import Path
@@ -108,40 +110,29 @@ tools.package_report(root_packages)
# Load dependencies:
# + tags=[]
import os
import psycopg2 # Postgres API
import geopandas as gp
import pandas as pd
import matplotlib.pyplot as plt
from typing import List, Tuple, Dict, Optional
from pyproj import Transformer, CRS, Proj
from IPython.display import clear_output, display, HTML
# -
# Activate autoreload of changed python files:
# + tags=[]
# %load_ext autoreload
# %autoreload 2
# -
# ## Parameters
#
# Define initial parameters that affect processing
# + tags=[]
WORK_DIR = Path.cwd().parents[0] / "tmp"
"""Working diretcory""" """Working diretcory"""
EPSG_CODE = 54009
@@ -154,125 +145,774 @@ CRS_WGS = "epsg:4326"
OUTPUT = Path.cwd().parents[0] / "out"
"""Define path to output directory (figures etc.)""";
# + tags=[]
WORK_DIR.mkdir(exist_ok=True)
# -
# ## Load benchmark data
# + tags=[]
source_zip="https://opara.zih.tu-dresden.de/xmlui/bitstream/handle/123456789/5793/S10.zip?sequence=1&isAllowed=y"
# + tags=[]
if not (WORK_DIR / "flickr-all.csv").exists():
    tools.get_zip_extract(uri=source_zip, filename="S10.zip", output_path=WORK_DIR)
# -
# Show the directory tree:
# + tags=[]
tools.tree(Path.cwd().parents[0])
# -
# ## Load data
# + tags=[]
ALL_FLICKR = WORK_DIR / "flickr_all_hll.csv"
# -
# + tags=[]
dtypes = {'latitude': float, 'longitude': float}
# + tags=[]
df = pd.read_csv(ALL_FLICKR, dtype=dtypes, encoding='utf-8')
# + tags=[]
df.head()
# -
# ## Test HLL
# First, let's test the slower Python implementation [python-hll](https://github.com/AdRoll/python-hll), to get a cardinality for some of the original HLL strings. Cardinality means the estimated count of distinct items in a set, which refers to the User Count in our case.
# + tags=[]
from python_hll.util import NumberUtil
from python_hll.hll import HLL
# + tags=[]
def hll_from_byte(hll_set: str):
"""Return HLL set from binary representation"""
hex_string = hll_set[2:]
return HLL.from_bytes(
NumberUtil.from_hex(
hex_string, 0, len(hex_string)))
# -
# Have a look at the first HLL set:
# + tags=[]
df["user_hll"][0]
# -
# Cast to HLL and calculate cardinality in one step:
# + tags=[]
hll_from_byte(df["user_hll"][0]).cardinality() - 1
# -
# An estimated number of 12 users has been observed at location `0.040050,-179.750586` (lat, lng).
#
# These latitude and longitude coordinates refer to the centroids of the original 50 km grid.
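# As a side note (a sketch, not part of the original workflow): assuming the 50 km grid is axis-aligned in the Mollweide projection (ESRI:54009), the bounds of the grid cell for such a centroid could be reconstructed by projecting the coordinate and adding ±25 km:
#
# ```python
# from pyproj import Transformer
#
# # hypothetical helper, using the centroid from the first row above
# to_mollweide = Transformer.from_crs("epsg:4326", "esri:54009", always_xy=True)
# x, y = to_mollweide.transform(-179.750586, 0.040050)  # (lon, lat) -> metres
# cell_bounds = (x - 25_000, y - 25_000, x + 25_000, y + 25_000)  # minx, miny, maxx, maxy
# ```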
# ## Test Postgres HLL
# In order to speed up processing, we can connect to a Postgres database running the faster postgresql-hll extension from Citus.
#
# The password and username for connecting to the local [hllworker](https://gitlab.vgiscience.de/lbsn/databases/pg-hll-empty) are loaded from the environment.
# + tags=[]
DB_USER = "hlluser"
DB_PASS = os.getenv('READONLY_USER_PASSWORD')
# set connection variables
DB_HOST = "hllworkerdb"
DB_PORT = "5432"
DB_NAME = "hllworkerdb"
# -
# Connect to the empty Postgres database running the HLL extension:
# + tags=[]
DB_CONN = psycopg2.connect(
host=DB_HOST,
    port=DB_PORT,
dbname=DB_NAME,
user=DB_USER,
password=DB_PASS
)
DB_CONN.set_session(
readonly=True)
DB_CALC = tools.DbConn(
DB_CONN)
CUR_HLL = DB_CONN.cursor()
# -
# Test connection:
# + tags=[]
CUR_HLL.execute("SELECT 1;")
print(CUR_HLL.statusmessage)
# -
# Test with an actual HLL set calculation:
# + tags=[]
db_query = f"""
SELECT lat, lng, hll_cardinality(user_hll)::int as usercount
FROM (VALUES
('{df["latitude"][0]}', '{df["longitude"][0]}', '{df["user_hll"][0]}'::hll)
) s(lat, lng, user_hll)
"""
# + tags=[]
df2 = DB_CALC.query(db_query)
# + tags=[]
df2.head()
# -
# Store these as two methods, to be used later.
# + tags=[]
def union_hll(hll_sets: List[str], cur_hll = CUR_HLL) -> str:
"Union two or more HLL sets and return the combined HLL set"
hll_values = ", ".join(f"('{hll_set}'::hll)" for hll_set in hll_sets)
db_query = f"""
SELECT hll_union_agg(s.hll_set)
FROM (
VALUES
{hll_values}
) s(hll_set);
"""
    cur_hll.execute(db_query)
    # return the first element of the result tuple
    return cur_hll.fetchone()[0]
def cardinality_hll(hll_set) -> int:
"Calculate the cardinality of a HLL set and return as int"
CUR_HLL.execute(f"SELECT hll_cardinality('{hll_set}')::int;")
return int(CUR_HLL.fetchone()[0])
# + tags=[]
results = union_hll([df["user_hll"][0], df["user_hll"][1]])
results
# + tags=[]
cardinality_hll(results)
# -
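# As a quick plausibility check (a sketch, not part of the original notebook), the cardinality of the union should not exceed the sum of the two individual cardinalities (up to HLL estimation error):
#
# ```python
# card_a = cardinality_hll(df["user_hll"][0])
# card_b = cardinality_hll(df["user_hll"][1])
# card_union = cardinality_hll(results)
# print(card_a, card_b, card_union)  # expect card_union <= card_a + card_b
# ```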
# ## Summarize distinct users per continent
# Now that we have verified individual HLL sets, we will
#
# 1. load the full list of HLL sets for all coordinates
# 2. assign coordinates to continents
# 3. union all HLL sets per continent, to retrieve the estimated number of distinct users who have visited each continent on Flickr
# ### Load continent geometries
# + tags=[]
world = gp.read_file(
gp.datasets.get_path('naturalearth_lowres'),
crs=CRS_WGS)
world = world.to_crs(CRS_PROJ)
# + tags=[]
world = world[['continent', 'geometry']]
# + tags=[]
ax = world.plot(column='continent')
ax.set_axis_off()
# -
# There are slight inaccuracies in the continent geometries (overlapping polygons), which can be fixed with a small buffer before dissolving the country geometries:
# + tags=[]
world['geometry'] = world.buffer(0.01)
# + tags=[]
continents = world.dissolve(by='continent')
# + tags=[]
continents.head()
# -
# Assign index back as a column, to classify via color by `column='continent'`
# + tags=[]
continents['continent'] = continents.index.get_level_values(0)
# + tags=[]
ax = continents.plot(column='continent')
ax.set_axis_off()
# -
# ### Create a GeoDataFrame from HLL coordinates
# + tags=[]
gdf = gp.GeoDataFrame(
df, geometry=gp.points_from_xy(df.longitude, df.latitude))
gdf.crs=CRS_WGS
gdf = gdf.to_crs(CRS_PROJ)
# -
# Define pyproj Transformer ahead of time with xy-order of coordinates, for future projections between WGS1984 and EPSG 54009 (Mollweide)
# + tags=[]
PROJ_TRANSFORMER = Transformer.from_crs(
CRS_WGS, CRS_PROJ, always_xy=True)
# also define reverse projection
PROJ_TRANSFORMER_BACK = Transformer.from_crs(
CRS_PROJ, CRS_WGS, always_xy=True)
# + tags=[]
gdf.set_index(['latitude', 'longitude'], inplace=True)
# + tags=[]
gdf.head()
# -
# ### Calculate Cardinality
# Define a preview area (Italy)
# + tags=[]
bbox_eu = (
6.8662109375, 35.24427318493909,
22.31396484375, 45.29320031385282)
# create bounds from the WGS1984 Italy bbox and project to Mollweide
minx, miny = PROJ_TRANSFORMER.transform(
bbox_eu[0], bbox_eu[1])
maxx, maxy = PROJ_TRANSFORMER.transform(
bbox_eu[2], bbox_eu[3])
# + tags=[]
gdf_italy = gdf.cx[minx:maxx, miny:maxy]
# + tags=[]
# %%time
cardinality_series = tools.hll_series_cardinality(gdf_italy["user_hll"], db_conn=DB_CALC)
# + tags=[]
cardinality_series.head()
# + tags=[]
gdf.loc[
cardinality_series.index,
"usercount"] = cardinality_series
# -
# Preview HLL coordinates and cardinality:
# + tags=[]
fig, ax = plt.subplots(1, 1)
ax.set_xlim(minx, maxx)
ax.set_ylim(miny, maxy)
plot_kwd = {
'column':'usercount',
'legend':True,
'markersize':25,
'cmap':'Reds',
'scheme':'quantiles',
'legend_kwds':{'bbox_to_anchor': (1, 1), 'frameon':False, 'markerscale':0.7}
}
base = gdf.cx[minx:maxx, miny:maxy].plot(
ax=ax, **plot_kwd)
world.plot(
ax=base, color='none', edgecolor='black', linewidth=0.5)
ax.set_axis_off()
# -
# ### Union HLL sets per continent
# The last step is to union HLL sets per continent and calculate the combined cardinality.
# + tags=[]
# %%time
gdf_overlay = gp.overlay(
gdf, world,
how='intersection')
gdf_overlay.set_index("continent", inplace=True)
# + tags=[]
gdf_overlay.head()
# + tags=[]
# %%time
cardinality_series = tools.union_hll_series(
gdf_overlay["user_hll"], db_conn=DB_CALC, cardinality=True)
# -
# Below we can see the combined distinct count of users on Flickr per continent.
# + tags=[]
cardinality_series.sort_values(ascending=False).head(10)
# -
# Assign the cardinality back to the continents GeoDataFrame
# + tags=[]
continents.loc[
cardinality_series.index,
"usercount"] = cardinality_series.values
# + tags=[]
plot_kwd['legend_kwds'] = {'bbox_to_anchor': (1.4, 1), 'frameon':False}
ax = continents.plot(**plot_kwd)
ax.set_axis_off()
# -
# ## Intersection example
# It is also possible to intersect HLL sets, to some degree. This can be used to estimate common visitor counts for (e.g.) countries. The average error, however, will be larger, especially for intersections of HLL sets of very different sizes.
# + tags=[]
world = gp.read_file(
gp.datasets.get_path('naturalearth_lowres'),
crs=CRS_WGS)
world = world.to_crs(
CRS_PROJ)
# -
# Select geometry for DE, FR and UK
# + tags=[]
geom1 = world[world['name'] == "Germany"]
geom2 = world[world['name'] == "France"]
geom3 = world[world['name'] == "United Kingdom"]
# -
# <div class="alert alert-warning" role="alert" style="color: black;">
# <details><summary style="cursor: pointer;"><strong>Select different countries</strong></summary>
# <ul>
# <li>Optionally: Modify the list of countries to adapt visualizations below.</li>
# </ul>
# Example:
# <pre lang="python">
# geom1 = world[world['name'] == "Germany"]
# geom2 = world[world['name'] == "United States of America"]
# geom3 = world[world['name'] == "Canada"]
# </pre>
# ----
# <pre lang="python">
# ref_a = "de"
# ref_b = "us"
# ref_c = "ca"
# name_ref = {
# ref_a:"Germany",
# ref_b:"USA",
# ref_c:"Canada"}
# </pre>
# </details>
# </div>
# + tags=[]
ref_a = "de"
ref_b = "fr"
ref_c = "uk"
name_ref = {
ref_a:"Germany",
ref_b:"France",
ref_c:"United Kingdom"}
# + [markdown] tags=[]
# For visual purposes, for some countries, we want to filter to contiguous areas, e.g. the mainland for the US, or mainland FR.
# + tags=[]
if ref_b == "us":
geom2 = geom2.explode(index_parts=False).iloc[0:1].dissolve(by='name')
# + tags=[]
if ref_b == "fr":
geom2 = geom2.explode(index_parts=False).iloc[1:].dissolve(by='name')
# + tags=[]
sel_colors=["#6d904f", "#008fd5", "#fc4f30"]
# + tags=[]
fig, ax = plt.subplots(1, 3)
for ix, subplot in enumerate([geom1, geom2, geom3]):
subplot.plot(ax=ax[ix], color=sel_colors[ix])
ax[ix].set_axis_off()
# -
# ### Select coordinates
#
# Intersect HLL coordinates with country geometry. Since the HLL coordinates are pre-aggregated (50 km grid), direct intersection will introduce some error (in this case, a form of the [MAUP](https://en.wikipedia.org/wiki/Modifiable_areal_unit_problem)).
#
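# To get a rough feel for the size of this border effect (a sketch with a hypothetical check, not part of the original analysis), one could compare how many grid centroids are selected with and without a 25 km buffer around the country geometry (the Mollweide units are metres):
#
# ```python
# from geopandas.tools import sjoin
#
# geom1_buf = geom1.copy()
# geom1_buf["geometry"] = geom1_buf.geometry.buffer(25_000)  # half a grid cell
# n_strict = len(sjoin(gdf, geom1, how="inner"))
# n_buffered = len(sjoin(gdf, geom1_buf, how="inner"))
# print(f"{n_strict} centroids within DE, {n_buffered} within DE +25 km")
# ```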
# + tags=[]
from geopandas.tools import sjoin
gdf_intersect = sjoin(
gdf, geom1, how='right')
# + tags=[]
hll_series = {
}
# -
# Union all HLL sets for selected geometry.
# + tags=[]
hll_series[ref_a] = union_hll(gdf_intersect["user_hll"])
# + [markdown] tags=[]
# This is the unioned HLL set for all unique users on Flickr who shared at least one geotagged photo from Germany.
# + tags=[]
hll_series[ref_a]
# -
# .. an estimated number of:
# + tags=[]
cardinality_hll(hll_series[ref_a])
# -
# Vice versa, we can use the index from the spatial join to select the original coordinates that intersect with the selected geometry:
# + tags=[]
def get_selected_coords(gdf_intersect, gdf) -> gp.GeoDataFrame:
"""Return selected coordinates based on index"""
gdf_intersect.set_index(
["index_left0", "index_left1"],
inplace=True)
return gdf.loc[gdf_intersect.index]
# + tags=[]
coords_ref_a = get_selected_coords(gdf_intersect, gdf)
# -
# Plot preview
# + tags=[]
minx, miny, maxx, maxy = gdf_intersect.total_bounds
# + tags=[]
fig, ax = plt.subplots(1, 1)
ax.set_xlim(minx, maxx)
ax.set_ylim(miny, maxy)
base = world.plot(
ax=ax, color='none', edgecolor='black', linewidth=0.5)
layers = coords_ref_a.plot(ax=base, markersize=5)
layers.set_axis_off()
# + [markdown] tags=[]
# Repeat for the other two countries
# + tags=[]
gdf_intersect = sjoin(
gdf, geom2, how='right')
hll_series[ref_b] = union_hll(gdf_intersect["user_hll"])
coords_ref_b = get_selected_coords(gdf_intersect, gdf)
# + tags=[]
gdf_intersect = sjoin(
    gdf, geom3, how='right')
hll_series[ref_c] = union_hll(gdf_intersect["user_hll"])
coords_ref_c = get_selected_coords(gdf_intersect, gdf)
# -
# Return distinct user counts with the error margin (±2.3%)
# + tags=[]
for ref, hll_set in hll_series.items():
    cardinality = cardinality_hll(hll_set)
    print(f'Estimated {cardinality} (±{int(cardinality/100*2.3)}) users for {name_ref.get(ref)}')
# -
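# Where does the ±2.3% come from? The relative standard error of HLL is $1.04 / \sqrt{m}$, with $m = 2^{log2m}$ registers. A short sketch, assuming log2m = 11 for this benchmark data:
#
# ```python
# import math
#
# log2m = 11  # assumed register exponent
# rel_error = 1.04 / math.sqrt(2**log2m)
# print(f"{rel_error:.1%}")  # -> 2.3%
# ```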
# ### Plot preview
# Get the total bounds:
# + tags=[]
minx, miny, maxx, maxy = pd.concat(
[coords_ref_a["geometry"], coords_ref_b["geometry"], coords_ref_c["geometry"]]).total_bounds
print(answer["choices"][0]["text"]) # + tags=[]
from typing import List, Optional
def plot_map(
gdf: gp.GeoDataFrame, sel_coords: List[gp.GeoDataFrame],
sel_colors: List[str], buf = 100000,
title: Optional[str] = None, ax = None):
"""Plot GeoDataFrame with matplotlib backend, optionaly export as png"""
if not ax:
fig, ax = plt.subplots(1, 1, figsize=(5, 10))
ax.set_xlim(minx-buf, maxx+buf)
ax.set_ylim(miny-buf, maxy+buf)
if title:
ax.set_title(title, fontsize=12)
for ix, sel_coord in enumerate(sel_coords):
sel_coord.plot(
ax=ax,
color=sel_colors[ix],
alpha=1.0,
markersize=10)
gdf.plot(
ax=ax,
alpha=0.2,
markersize=0.1)
# combine with world geometry
world.plot(
ax=ax, color='none', edgecolor='black', linewidth=0.3)
# ax.imshow(fig, interpolation='nearest', cmap='gray')
# turn axis off
ax.set_axis_off()
# + tags=[]
sel_coords=[coords_ref_a, coords_ref_b, coords_ref_c]
plot_map(
gdf=gdf, sel_coords=sel_coords,
sel_colors=sel_colors,
title=f'Grid selection for {ref_a.upper()}, {ref_b.upper()} and {ref_c.upper()}')
# -
# ### Calculate HLL intersection (common visitors)
# According to the [inclusion-exclusion principle](https://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle):
#
# $|A \cup B| = |A| + |B| - |A \cap B|$
#
# which can also be written as:
#
# $|A \cap B| = |A| + |B| - |A \cup B|$
#
# Therefore, unions can be used to calculate intersection. Calculate $|DE \cup US|$, $|DE \cup CA|$ and $|US \cup CA|$, i.e.:
# ```python
# IntersectionCount =
# cardinality_hll(de)::int +
# cardinality_hll(us)::int -
# cardinality_hll(hll_union(de, us))
# ```
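# Expressed with the helper functions defined earlier, a pairwise intersection could look like this (a sketch, not part of the original notebook):
#
# ```python
# def intersection_hll(hll_a: str, hll_b: str) -> int:
#     """|A ∩ B| = |A| + |B| - |A ∪ B| (may yield small negative estimates)"""
#     return (
#         cardinality_hll(hll_a)
#         + cardinality_hll(hll_b)
#         - cardinality_hll(union_hll([hll_a, hll_b])))
# ```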
# + tags=[]
distinct_users_total = {}
distinct_users_total[ref_a] = cardinality_hll(hll_series[ref_a])
distinct_users_total[ref_b] = cardinality_hll(hll_series[ref_b])
distinct_users_total[ref_c] = cardinality_hll(hll_series[ref_c])
distinct_users_total
print(answer["choices"][0]["text"]) # + tags=[]
union_p1 = (hll_series[ref_a], hll_series[ref_b])
union_p2 = (hll_series[ref_a], hll_series[ref_c])
union_p3 = (hll_series[ref_b], hll_series[ref_c])
# -
# First, prepare combinations of the different sets.
print(answer["choices"][0]["text"]) # + tags=[]
hll_sel = {
f"{ref_a}-{ref_b}": union_p1,
f"{ref_a}-{ref_c}": union_p2,
f"{ref_b}-{ref_c}": union_p3
}
distinct_common = {}
for int_tuple, hll_sets in hll_sel.items():
hll_set = union_hll(
[hll_sets[0], hll_sets[1]])
distinct_common[int_tuple] = cardinality_hll(hll_set)
print(
f"{distinct_common[int_tuple]} distinct total users "
f"who shared Flickr photos from either {name_ref.get(int_tuple.split('-')[0])} "
f"or {name_ref.get(int_tuple.split('-')[1])} (union)")
# -
# Calculate intersection
print(answer["choices"][0]["text"]) # + tags=[]
distinct_intersection = {}
for a, b in [(ref_a, ref_b), (ref_a, ref_c), (ref_b, ref_c)]:
a_total = distinct_users_total.get(a)
b_total = distinct_users_total.get(b)
common_ref = f'{a}-{b}'
intersection_count = a_total + b_total - distinct_common.get(common_ref)
distinct_intersection[common_ref] = intersection_count
print(
f"{distinct_intersection[common_ref]} distinct users "
f"who shared Flickr photos from {name_ref.get(a)} and {name_ref.get(b)} (intersection)")
# -
# Finally, let's get the number of users who have shared pictures from all three countries, based on the [formula for three sets](https://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle):
#
# $|A \cup B \cup C| = |A| + |B| + |C| - |A \cap B| - |A \cap C| - |B \cap C| + |A \cap B \cap C|$
#
# which can also be written as:
#
# $|A \cap B \cap C| = |A \cup B \cup C| - |A| - |B| - |C| + |A \cap B| + |A \cap C| + |B \cap C|$
# + tags=[]
union_count_all = cardinality_hll(union_hll(
[hll_series[ref_a], hll_series[ref_b], hll_series[ref_c]]))
union_count_all
# + tags=[]
intersection_count_all = union_count_all - \
distinct_users_total[ref_a] - \
distinct_users_total[ref_b] - \
distinct_users_total[ref_c] + \
distinct_intersection[f'{ref_a}-{ref_b}'] + \
distinct_intersection[f'{ref_a}-{ref_c}'] + \
distinct_intersection[f'{ref_b}-{ref_c}']
print(intersection_count_all)
# -
# Since negative visitor counts are impossible, the increasing inaccuracies for nested intersections are easily observable here. In other words, there is simply too little overlap between the visitors of all three countries to be measurable with HLL intersections.
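# One pragmatic guard (a sketch, not part of the original analysis) is to clamp such estimates to zero before reusing them:
#
# ```python
# intersection_count_all = max(intersection_count_all, 0)
# ```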
# ## Visualize as Venn diagram
# Since we're going to visualize this with [matplotlib-venn](https://github.com/konstantint/matplotlib-venn), we need the following variables:
# + tags=[]
from matplotlib_venn import venn3, venn3_circles
plt.figure(figsize=(3,3))
v = venn3(
subsets=(
500,
500,
100,
500,
100,
100,
10),
set_labels = ('A', 'B', 'C'))
v.get_label_by_id('100').set_text('Abc')
v.get_label_by_id('010').set_text('aBc')
v.get_label_by_id('001').set_text('abC')
v.get_label_by_id('110').set_text('ABc')
v.get_label_by_id('101').set_text('AbC')
v.get_label_by_id('011').set_text('aBC')
v.get_label_by_id('111').set_text('ABC')
plt.show()
# -
# We already have `ABC`, the other values can be calculated:
# + tags=[]
ABC = intersection_count_all
ABc = distinct_intersection[f'{ref_a}-{ref_b}'] - ABC
aBC = distinct_intersection[f'{ref_b}-{ref_c}'] - ABC
AbC = distinct_intersection[f'{ref_a}-{ref_c}'] - ABC
# exclusive ("only") regions: subtract the two pairwise-only parts and the triple overlap
Abc = distinct_users_total[ref_a] - ABc - AbC - ABC
aBc = distinct_users_total[ref_b] - ABc - aBC - ABC
abC = distinct_users_total[ref_c] - aBC - AbC - ABC
# -
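# As a sanity check (a sketch, not part of the original notebook), the seven disjoint regions should add up to the estimated size of the union of all three sets:
#
# ```python
# region_sum = Abc + aBc + ABc + abC + AbC + aBC + ABC
# print(region_sum, union_count_all)  # should match
# ```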
# Define a function to plot the Venn diagram.
# +
from typing import Tuple
def plot_venn(
subset_sizes: List[int],
colors: List[str],
names: List[str],
subset_sizes_raw: List[int] = None,
ax = None,
title: str = None):
"""Plot Venn Diagram"""
if not ax:
fig, ax = plt.subplots(1, 1, figsize=(5,5))
set_labels = (
'A', 'B', 'C')
v = venn3(
subsets=(
[subset_size for subset_size in subset_sizes]),
set_labels = set_labels,
ax=ax)
for ix, idx in enumerate(
['100', '010', '001']):
v.get_patch_by_id(
idx).set_color(colors[ix])
v.get_patch_by_id(
idx).set_alpha(0.8)
v.get_label_by_id(
set_labels[ix]).set_text(
names[ix])
if subset_sizes_raw:
for ix, idx in enumerate(
['100', '010', None, '001']):
if not idx:
continue
dif_abs = subset_sizes[ix] - subset_sizes_raw[ix]
dif_perc = dif_abs / (subset_sizes_raw[ix] / 100)
v.get_label_by_id(idx).set_text(
f'{subset_sizes[ix]}\n{dif_perc:+.1f}%')
label_ids = [
'100', '010', '001',
'110', '101', '011',
'111', 'A', 'B', 'C']
for label_id in label_ids:
v.get_label_by_id(
label_id).set_fontsize(14)
# draw borders
c = venn3_circles(
subsets=(
[subset_size for subset_size in subset_sizes]),
linestyle='dashed',
lw=1,
ax=ax)
if title:
ax.title.set_text(title)
# -
# Plot Venn Diagram:
# + tags=[]
subset_sizes = [
Abc, aBc, ABc, abC, AbC, aBC, ABC]
names = [
name_ref.get(ref_a), name_ref.get(ref_b), name_ref.get(ref_c)]
plot_venn(
subset_sizes=subset_sizes,
colors=sel_colors,
names=names,
title="Common User Count")
# -
# ### Combine Map & Venn Diagram
#
# A figure with two subplots (1 row, 2 columns).
# + tags=[]
fig, ax = plt.subplots(1, 2, figsize=(10, 15), width_ratios=[1, 1])
plot_map(
gdf=gdf, sel_coords=sel_coords,
sel_colors=sel_colors, ax=ax[0])
plot_venn(
subset_sizes=subset_sizes,
colors=sel_colors,
names=names,
ax=ax[1])
# store as png
fig.savefig(
Path.cwd().parents[0] / "resources" / f"hll_intersection_{ref_a}{ref_b}{ref_c}.png", dpi=300, format='PNG',
bbox_inches='tight', pad_inches=0)
# -
# Each dot on the left side is a coordinate shared in the original dataset, with an attached HLL set abstracting all users who shared photographs from this part of the world. By unioning all HLL sets for each of the three countries, based on coordinate-country intersection, we were able to calculate the intersection between the different sets. These estimated common visitor counts are labeled on the right side in the Venn diagram.
#
# The number of distinct users who shared photos from the US by far outweighs Germany and Canada. Not surprisingly, there is a larger overlap between common visitors for Canada and the USA. By union of HLL sets, we are able to estimate distinct visitor counts for arbitrary areas or regions. This ability to re-use HLL sets is, however, limited by the 50 km resolution of the shared benchmark dataset.
# ## Create notebook HTML
# + tags=[]
# !jupyter nbconvert --to html_toc \
# --output-dir=../resources/html/ ./intro.ipynb \
# --template=../nbconvert.tpl \
# --ExtractOutputPreprocessor.enabled=False >&- 2>&-
# -