New retrieval function

TODO: Clean up after the old retrieval function, remove the PCA options, etc.
parent c285d76c
......@@ -37,7 +37,7 @@ class ImageFolderWithPaths(torchvision.datasets.ImageFolder):
def __getitem__(self, item):
base_tuple = super(ImageFolderWithPaths, self).__getitem__(item)
filepath = self.imgs[item][0]
return base_tuple + (filepath,)
return base_tuple[0], filepath
def get_path_data_loader(dataset_path, batch_size=1, num_workers=4):
......
......@@ -3,6 +3,7 @@ from Training.dataloader import get_data_loaders, get_path_data_loader
from pathlib import Path
from Training.utils import score_match, visualize_match, calculate_pca, format_time
from sklearn.preprocessing import normalize
from multiprocessing import Pool
import torch
import numpy as np
import uuid
......@@ -16,7 +17,7 @@ CENTER_CROP_SIZE_KEYPOINT = 900
RANDOM_CROP_SITE_KEYPOINT = 720
BATCH_PRINT_FREQ = 20
MAX_PCA_FEATURES = 500000
NUM_VIS_MATCH = 10
class ExperimentManager:
"""
ExperimentManager: Controls the various stages of the experiments. parameters can be provided as single values or
......@@ -26,7 +27,7 @@ class ExperimentManager:
def __init__(self, experiment_name, stages, dataset, validation_split=0.2, batch_size=8, num_workers=4, epochs=30,
learning_rate=0.008, learning_rate_gamma=0.5, learning_rate_step_size=10, weight_decay=0.0001,
load_from=None, target_layer="layer3", use_l2_normalization=True, use_retrieval_normalization=True,
pca_dataset=None, pca_load=None, pca_log=False):
pca_dataset=None, pca_load=None, pca_log=False, visualize_top_matches= False):
print(f"Current working directory is {Path.cwd()}")
print(f"Running on CUDA:{torch.cuda.is_available()}")
print("Preparing experiment:")
......@@ -108,9 +109,11 @@ class ExperimentManager:
required_type=str,
required_in_stages={"finetuning", "keypoints", "retrieval"}
)
# TODO: clean this up; we are not doing different PCA modes
self.pca_dataset = pca_dataset
self.pca_load = pca_load
self.pca_log = pca_log
self.visualize_top_matches = visualize_top_matches
if "retrieval" in self.stages:
if self.pca_dataset is not None:
self.pca_dataset = Path(pca_dataset)
......@@ -350,18 +353,22 @@ class ExperimentManager:
use_l2_normalization=self.use_l2_normalization).to(self.device)
model.eval()
# get path to all images
images = self.dataset["retrieval"].joinpath(Path("data"))
images = self.dataset["retrieval"]
# get path to query list
query = self.dataset["retrieval"].joinpath(Path("query.json"))
# create path to save results
stage_path = self.experiment_path.joinpath(Path("retrieval"))
result_path = stage_path.joinpath("query_results")
if not result_path.is_dir():
result_path.mkdir(parents=True)
print("Starting feature extraction")
with torch.no_grad():
accumulated_features = None
image_info = {}
data_loader = get_path_data_loader(images, num_workers=self.num_workers["retrieval"])
for ind, batch in enumerate(data_loader):
# get all image info from loader
image_input, label, path = batch
image_input, path = batch
image_input = image_input.to(self.device)
filename = Path(path[0]).stem
# get features and rf info from model
......@@ -374,11 +381,15 @@ class ExperimentManager:
else:
accumulated_features = torch.cat((accumulated_features, features), dim=0)
image_info[filename] = [features, rf_centers]
print(f"Gathered features of [{ind+1}/{len(data_loader)}] images")
accumulated_features = accumulated_features.numpy()
# if we have too many features for pca we choose a random subset
if len(accumulated_features) > MAX_PCA_FEATURES:
print(f"Number of gathered features exceeds maximum of {MAX_PCA_FEATURES}. Selected random"
f" subset for PCA calculation!")
accumulated_features = np.random.permutation(accumulated_features)[:MAX_PCA_FEATURES]
# calculate and evaluate the pca matrix
print("Training PCA Matrix")
pca = calculate_pca(accumulated_features, stage_path.joinpath(
f"{self.load_paths['retrieval'].stem}_{self.dataset['retrieval'].stem}.pca"), self.pca_log)
for filename in image_info:
......@@ -388,12 +399,33 @@ class ExperimentManager:
if self.use_retrieval_normalization:
features = normalize(features, norm='l2', axis=1)
image_info[filename] = [features, rf_centers]
"""
read from query file
multithread matching mit workern dafür dann kdtree in single thread stellen
ausgabe als json file pro query mit tripeln (filename, matching score, ransac score)
"""
print("Feature processing completed! Begin matching!")
with query.open("r") as query_file:
query_filenames = json.load(query_file)["queries"]
with Pool(self.num_workers["retrieval"]) as worker_pool:
for ind, query_filename in enumerate(query_filenames):
print(f"Matching query [{ind + 1}/{len(query_filenames)}]")
query_features, query_rf_centers = image_info[query_filename]
job_list = []
for index_filename in image_info:
index_features, index_rf_centers = image_info[index_filename]
job_list.append((index_features, query_features, index_rf_centers, query_rf_centers,
index_filename))
results = worker_pool.starmap(score_match, job_list)
if self.visualize_top_matches:
results.sort(key=lambda x: sum(x[0]), reverse=True)
for match_ind in range(NUM_VIS_MATCH):
matches, index_locations, query_locations, index_filename = results[match_ind]
index_filepath = images.joinpath(f"data/{index_filename}.jpg")
query_filepath = images.joinpath(f"data/{query_filename}.jpg")
visualize_match(index_filepath, query_filepath, index_locations, query_locations, matches)
# log index filename, number of verified matches, number of feature matches
query_result = [(match[3], sum(match[0]).item(), len(match[1])) for match in results]
log_data = {"query": query_filename, "results": query_result}
log_file = result_path.joinpath(query_filename)
with log_file.open('w', encoding='utf-8') as json_log:
json.dump(log_data, json_log, ensure_ascii=False)
def perform_retrieval(self):
# load model in retrieval mode
......@@ -540,10 +572,11 @@ def check_experiment_wide_parameter(parameter, parameter_name, required_type, al
def fire_experiment(experiment_name, stages, dataset, validation_split=0.2, batch_size=8, num_workers=4, epochs=30,
learning_rate=0.008, learning_rate_gamma=0.5, learning_rate_step_size=10, weight_decay=0.0001,
load_from=None, target_layer="layer3", use_l2_normalization=True, use_retrieval_normalization=True,
pca_dataset=None, pca_load=None, pca_log=False):
pca_dataset=None, pca_load=None, pca_log=False, visualize_top_matches= False):
ExperimentManager(experiment_name, stages, dataset, validation_split, batch_size, num_workers, epochs,
learning_rate, learning_rate_gamma, learning_rate_step_size, weight_decay, load_from,
target_layer, use_l2_normalization, use_retrieval_normalization, pca_dataset, pca_load, pca_log)
target_layer, use_l2_normalization, use_retrieval_normalization, pca_dataset, pca_load, pca_log,
visualize_top_matches)
#torch.backends.cudnn.benchmark = True
#exp = ExperimentManager("30_epoch_run", {"finetuning","keypoints","retrieval"}, {"finetuning": "../Datasets/Landmarks", "keypoints": "../../Datasets/Landmarks", "retrieval": "../../Datasets/Oxford"}, epochs=30)
......
......@@ -22,12 +22,10 @@ RANSAC_NUM_TRAILS = 1000
RANSAC_RESIDUAL_THRESHOLD = 12.5 #20
def score_match(index_features, query_features, index_locations, query_locations):
def score_match(index_features, query_features, index_locations, query_locations, index_filename):
index_tree = cKDTree(index_features)
distances, indices = index_tree.query(
query_features, distance_upper_bound=KD_TREE_DISTANCE_THRESHOLD, n_jobs=-1)
#print(distances)
#print(indices)
query_features, distance_upper_bound=KD_TREE_DISTANCE_THRESHOLD)
# Filter out features with no close neighbours
cleaned_query_locations = np.array([
query_locations[i, ] for i in range(query_locations.shape[0])
......@@ -38,11 +36,8 @@ def score_match(index_features, query_features, index_locations, query_locations
index_locations[indices[i], ] for i in range(query_locations.shape[0])
if indices[i] != index_locations.shape[0]
])
#print(cleaned_index_locations)
#print(cleaned_query_locations)
print(cleaned_query_locations.size)
if cleaned_query_locations.shape[0] <= RANSAC_MIN_SAMPLES:
return [False], None, None
return [False], None, None, index_filename
# Perform geometric verification using RANSAC.
# RANSAC currently takes ~35 times longer than the k-d tree query
model_robust, inliers = ransac(
......@@ -52,10 +47,8 @@ def score_match(index_features, query_features, index_locations, query_locations
residual_threshold=RANSAC_RESIDUAL_THRESHOLD,
max_trials=RANSAC_NUM_TRAILS)
if model_robust is None:
return [False], None, None
#print(sum(inliers))
#print(model_robust)
return inliers, cleaned_index_locations, cleaned_query_locations
return [False], None, None, index_filename
return inliers, cleaned_index_locations, cleaned_query_locations, index_filename
def get_receptive_boxes(height, width, scale, target_layer="layer3"):
......@@ -147,7 +140,7 @@ def calculate_pca(data, save_path=None, log_pca=False):
pca = PCA(n_components=PCA_COMPONENTS, whiten=True)
pca.fit(data)
explained_variance = sum(pca.explained_variance_ratio_)
print(f"calculated pca matrix. Explained variance is {explained_variance:.2f}"
print(f"Calculated pca matrix. Explained variance is {explained_variance:.2f}"
f" over {pca.n_components} components")
if save_path:
print(f"saving pca data to {save_path}")
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment