[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[gnuastro-commits] master a7bfa5b 11/19: Match: k-d tree matching UI fix
From: |
Mohammad Akhlaghi |
Subject: |
[gnuastro-commits] master a7bfa5b 11/19: Match: k-d tree matching UI fixed, to do: finalizing docs and tests |
Date: |
Sun, 14 Nov 2021 20:40:59 -0500 (EST) |
branch: master
commit a7bfa5ba545056b68e9a594224d57b76506a5433
Author: Mohammad Akhlaghi <mohammad@akhlaghi.org>
Commit: Mohammad Akhlaghi <mohammad@akhlaghi.org>
Match: k-d tree matching UI fixed, to do: finalizing docs and tests
Until now, while most low-level aspects of the k-d tree based matching were
done, the high-level aspects and in paritcular the documentation wasn't
done yet.
With this commit, the UI now works in the various scenarios and the
documentation is mostly complete (in describing the algorithms). However,
the options are still not added and no tests have been added to 'make
check' yet.
---
bin/match/args.h | 15 ++-
bin/match/astmatch.conf | 2 +
bin/match/main.h | 6 +-
bin/match/match.c | 147 ++++++++++++++++++++---------
bin/match/ui.c | 144 +++++++++++++++++++++++++---
bin/match/ui.h | 1 +
doc/gnuastro.texi | 113 +++++++++++++++++-----
during-dev-test-data/kdtree-input.fits | Bin 8640 -> 0 bytes
during-dev-test-data/match-query.txt | 10 --
during-dev-test-data/scripts/gen-inputs.sh | 34 +++++++
10 files changed, 379 insertions(+), 93 deletions(-)
diff --git a/bin/match/args.h b/bin/match/args.h
index d567c0f..9297d1b 100644
--- a/bin/match/args.h
+++ b/bin/match/args.h
@@ -161,7 +161,7 @@ struct argp_option program_options[] =
UI_KEY_KDTREE,
"STR",
0,
- "build, internal, automatic, none, FILE.",
+ "build, internal, disable, CUSTOM-FITS-FILE.",
UI_GROUP_CATALOGMATCH,
&p->kdtree,
GAL_TYPE_STRING,
@@ -169,6 +169,19 @@ struct argp_option program_options[] =
GAL_OPTIONS_NOT_MANDATORY,
GAL_OPTIONS_NOT_SET,
},
+ {
+ "kdtreehdu",
+ UI_KEY_KDTREEHDU,
+ "STR",
+ 0,
+ "HDU of k-d tree when --kdtree=FILE.fits.",
+ UI_GROUP_CATALOGMATCH,
+ &p->kdtreehdu,
+ GAL_TYPE_STRING,
+ GAL_OPTIONS_RANGE_ANY,
+ GAL_OPTIONS_NOT_MANDATORY,
+ GAL_OPTIONS_NOT_SET,
+ },
diff --git a/bin/match/astmatch.conf b/bin/match/astmatch.conf
index 6e64203..2b89ea6 100644
--- a/bin/match/astmatch.conf
+++ b/bin/match/astmatch.conf
@@ -21,5 +21,7 @@
# Input
hdu2 1
+ kdtreehdu 1
# Catalog matching
+ kdtree internal
diff --git a/bin/match/main.h b/bin/match/main.h
index 9084357..bd958d2 100644
--- a/bin/match/main.h
+++ b/bin/match/main.h
@@ -33,6 +33,8 @@ along with Gnuastro. If not, see
<http://www.gnu.org/licenses/>.
#define PROGRAM_EXEC "astmatch" /* Program executable name. */
#define PROGRAM_STRING PROGRAM_NAME" (" PACKAGE_NAME ") " PACKAGE_VERSION
+/* Internal constants */
+#define MATCH_KDTREE_ROOT_KEY "KDTROOT"
enum match_modes
@@ -47,7 +49,6 @@ enum kdtree_modes
MATCH_KDTREE_INVALID, /* ==0 by default. */
MATCH_KDTREE_BUILD,
MATCH_KDTREE_INTERNAL,
- MATCH_KDTREE_AUTO,
MATCH_KDTREE_DISABLE,
MATCH_KDTREE_FILE,
};
@@ -68,6 +69,7 @@ struct matchparams
gal_data_t *outcols; /* Array of second input column names. */
gal_data_t *aperture; /* Acceptable matching aperture. */
char *kdtree; /* The mode to use k-d tree mode. */
+ char *kdtreehdu; /* k-d tree HDU when its a (FITS) file. */
uint8_t logasoutput; /* Don't rearrange inputs, out is log. */
uint8_t notmatched; /* Output is rows that don't match. */
@@ -84,6 +86,8 @@ struct matchparams
char *out2name; /* Name of second matched output. */
gal_list_str_t *stdinlines; /* Lines given by Standard input. */
int kdtreemode; /* The k-d tree mode. */
+ gal_data_t *kdtreedata;
+ size_t kdtreeroot;
/* Output: */
time_t rawtime; /* Starting time of the program. */
diff --git a/bin/match/match.c b/bin/match/match.c
index c398146..aaf53dc 100644
--- a/bin/match/match.c
+++ b/bin/match/match.c
@@ -36,6 +36,7 @@ along with Gnuastro. If not, see
<http://www.gnu.org/licenses/>.
#include <gnuastro/threads.h>
#include <gnuastro/permutation.h>
+#include <gnuastro-internal/timing.h>
#include <gnuastro-internal/checkset.h>
#include <main.h>
@@ -453,44 +454,42 @@ match_catalog_write_one_col(struct matchparams *p,
gal_data_t *a,
static void
match_catalog_kdtree_build(struct matchparams *p)
{
+ char *msg;
size_t root;
+ struct timeval t1;
gal_data_t *kdtree;
gal_fits_list_key_t *keylist=NULL;
/* Meta-data in the output fits file. */
char *unit = "index";
- char *keyname = "KDTROOT";
char *comment = "k-d tree root index (counting from 0).";
/* Construct a k-d tree from 'p->cols1': the index of root is stored in
'root'. */
+ if(!p->cp.quiet) gettimeofday(&t1, NULL);
kdtree = gal_kdtree_create(p->cols1, &root);
+ if(!p->cp.quiet)
+ {
+ if( asprintf(&msg, "k-d tree constructed (%zu rows).", p->cols1->size)<0
)
+ error(EXIT_FAILURE, errno, "asprintf allocation");
+ gal_timing_report(&t1, msg, 1);
+ free(msg);
+ }
/* Write the k-d tree to a file and write root index and input name
as FITS keywords ('gal_table_write' frees 'keylist'). */
gal_fits_key_list_title_add(&keylist, "k-d tree parameters", 0);
- gal_fits_key_write_filename("KDTIN", p->input1name, &keylist, 0);
- gal_fits_key_list_add_end(&keylist, GAL_TYPE_SIZE_T, keyname, 0,
+ gal_fits_key_write_filename("KDTIN", p->input1name, &keylist, 0,
+ p->cp.quiet);
+ gal_fits_key_list_add_end(&keylist, GAL_TYPE_SIZE_T,
+ MATCH_KDTREE_ROOT_KEY, 0,
&root, 0, comment, 0, unit, 0);
gal_table_write(kdtree, &keylist, NULL, GAL_TABLE_FORMAT_BFITS,
p->out1name, "kdtree", 0);
/* Let the user know that the k-d tree has been built. */
if(!p->cp.quiet)
- fprintf(stdout, "Output (k-d tree): %s\n", p->out1name);
-}
-
-
-
-
-
-/* See if a k-d tree should be used (MATCH_KDTREE_INTERNAL) or classical
- matching (MATCH_KDTREE_DISABLE). */
-static int
-match_catalog_kdtree_auto(struct matchparams *p)
-{
- error(EXIT_FAILURE, 0, "%s: not yet implemented!", __func__);
- return MATCH_KDTREE_INVALID;
+ fprintf(stdout, " - Output (k-d tree): %s\n", p->out1name);
}
@@ -502,15 +501,9 @@ match_catalog_kdtree_auto(struct matchparams *p)
static gal_data_t *
match_catalog_kdtree(struct matchparams *p, size_t *nummatched)
{
- size_t root;
+ char *msg;
+ struct timeval t1;
gal_data_t *out=NULL;
- gal_data_t *kdtree=NULL;
-
- /* If we are in automatic mode, we should look at the data (number of
- rows/columns) and system (number of threads) to decide if the mode
- should be set to 'MATCH_KDTREE_INTERNAL' or 'MATCH_KDTREE_DISABLE'. */
- if(p->kdtreemode==MATCH_KDTREE_AUTO)
- p->kdtreemode=match_catalog_kdtree_auto(p);
/* Operate according to the required mode. */
switch(p->kdtreemode)
@@ -521,13 +514,38 @@ match_catalog_kdtree(struct matchparams *p, size_t
*nummatched)
break;
/* Do the k-d tree matching. */
+ case MATCH_KDTREE_FILE:
case MATCH_KDTREE_INTERNAL:
- kdtree = gal_kdtree_create(p->cols1, &root);
- out = gal_match_kdtree(p->cols1, p->cols2, kdtree, root,
- p->aperture->array, p->cp.numthreads,
- p->cp.minmapsize, p->cp.quietmmap,
- nummatched);
- gal_list_data_free(kdtree);
+
+ /* If the k-d tree should be constructed internally, build it,
+ otherwise, we have already read an checked the k-d tree in 'ui.c',
+ so go directly to the matching. */
+ if(p->kdtreemode==MATCH_KDTREE_INTERNAL)
+ {
+ if(!p->cp.quiet) gettimeofday(&t1, NULL);
+ p->kdtreedata = gal_kdtree_create(p->cols1, &p->kdtreeroot);
+ if(!p->cp.quiet)
+ gal_timing_report(&t1, "Internal k-d tree constructed.", 1);
+ }
+
+ /* Do k-d tree based match. */
+ if(!p->cp.quiet)
+ {
+ gettimeofday(&t1, NULL);
+ printf(" - Match using the k-d tree ...\n");
+ }
+ out = gal_match_kdtree(p->cols1, p->cols2, p->kdtreedata,
+ p->kdtreeroot, p->aperture->array,
+ p->cp.numthreads, p->cp.minmapsize,
+ p->cp.quietmmap, nummatched);
+ if(!p->cp.quiet)
+ {
+ if( asprintf(&msg, "... done (%zu matches found).", *nummatched)<0 )
+ error(EXIT_FAILURE, errno, "asprintf allocation");
+ gal_timing_report(&t1, msg, 1);
+ free(msg);
+ }
+ gal_list_data_free(p->kdtreedata);
break;
/* No 'default' necessary because the modes include disabling. */
@@ -541,12 +559,48 @@ match_catalog_kdtree(struct matchparams *p, size_t
*nummatched)
+static gal_data_t *
+match_catalog_sort_based(struct matchparams *p, size_t *nummatched)
+{
+ char *msg;
+ gal_data_t *mcols;
+ struct timeval t1;
+
+ /* Let the user know that the matching has started. */
+ if(!p->cp.quiet)
+ {
+ gettimeofday(&t1, NULL);
+ printf(" - Matching by sorting ...\n");
+ }
+
+ /* Do the matching. */
+ mcols=gal_match_sort_based(p->cols1, p->cols2, p->aperture->array,
+ 0, 1, p->cp.minmapsize, p->cp.quietmmap,
+ nummatched);
+
+ /* Let the user know that it finished. */
+ if(!p->cp.quiet)
+ {
+ if( asprintf(&msg, "... done (%zu matches found).", *nummatched)<0 )
+ error(EXIT_FAILURE, errno, "asprintf allocation");
+ gal_timing_report(&t1, msg, 1);
+ free(msg);
+ }
+
+ /* Return the permutations. */
+ return mcols;
+}
+
+
+
+
+
static void
match_catalog(struct matchparams *p)
{
uint32_t *u, *uf;
- gal_data_t *tmp, *mcols;
- gal_data_t *a=NULL, *b=NULL;
+ struct timeval t1;
+ gal_data_t *tmp, *a=NULL, *b=NULL, *mcols=NULL;
size_t nummatched, *acolmatch=NULL, *bcolmatch=NULL;
/* If we want to use kd-tree for matching. */
@@ -560,18 +614,22 @@ match_catalog(struct matchparams *p)
if(p->kdtreemode==MATCH_KDTREE_BUILD) return;
}
- /* Find the matching coordinates. We are doing the processing in
- place. Incase it was decided not to use a k-d tree at all
- (in 'automatic' mode), then we need to use the classic mode. */
+ /* If k-d tree-based matching wasn't used, use the sort-based
+ matching. */
if(mcols==NULL)
- mcols=gal_match_sort_based(p->cols1, p->cols2, p->aperture->array,
- 0, 1, p->cp.minmapsize, p->cp.quietmmap,
- &nummatched);
+ mcols=match_catalog_sort_based(p, &nummatched);
/* If the output is to be taken from the input columns (it isn't just the
log), then do the job. */
if(p->logasoutput==0)
{
+ /* Let the user know what is happening. */
+ if(!p->cp.quiet)
+ {
+ gettimeofday(&t1, NULL);
+ printf(" - Re-arranging matched rows (skip this with
'--logasoutput')...\n");
+ }
+
/* Read (and possibly write) the outputs. Note that we only need to
read the table when it is necessary for the output (the user might
have asked for '--outcols', only with columns of one of the two
@@ -604,6 +662,10 @@ match_catalog(struct matchparams *p)
/* Clean up. */
if(a) gal_list_data_free(a);
if(b) gal_list_data_free(b);
+
+ /* Let the user know. */
+ if( !p->cp.quiet )
+ gal_timing_report(&t1, "... done.", 1);
}
/* Write the raw information in a log file if necessary. */
@@ -655,12 +717,11 @@ match_catalog(struct matchparams *p)
/* Print the number of matches if not in quiet mode. */
if(!p->cp.quiet)
{
- fprintf(stdout, "Number of matching rows in both catalogs: %zu\n",
- nummatched);
if(p->out2name && strcmp(p->out1name, p->out2name))
- fprintf(stdout, "Output:\n %s\n %s\n", p->out1name, p->out2name);
+ fprintf(stdout, " - Output-1: %s\n - Output-2: %s\n",
+ p->out1name, p->out2name);
else
- fprintf(stdout, "Output: %s\n", p->out1name);
+ fprintf(stdout, " - Output: %s\n", p->out1name);
}
}
diff --git a/bin/match/ui.c b/bin/match/ui.c
index bc4eb70..f260dba 100644
--- a/bin/match/ui.c
+++ b/bin/match/ui.c
@@ -29,6 +29,7 @@ along with Gnuastro. If not, see
<http://www.gnu.org/licenses/>.
#include <string.h>
#include <gnuastro/fits.h>
+#include <gnuastro/threads.h>
#include <gnuastro-internal/timing.h>
#include <gnuastro-internal/options.h>
@@ -107,6 +108,7 @@ ui_initialize_options(struct matchparams *p,
cp->program_exec = PROGRAM_EXEC;
cp->program_bibtex = PROGRAM_BIBTEX;
cp->program_authors = PROGRAM_AUTHORS;
+ cp->numthreads = gal_threads_number();
cp->coptions = gal_commonopts_options;
@@ -120,7 +122,6 @@ ui_initialize_options(struct matchparams *p,
cp->coptions[i].doc="Extension name or number of first input.";
break;
case GAL_OPTIONS_KEY_TYPE:
- case GAL_OPTIONS_KEY_NUMTHREADS:
cp->coptions[i].flags=OPTION_HIDDEN;
break;
}
@@ -223,17 +224,15 @@ ui_read_check_only_options(struct matchparams *p)
/* Set the k-d tree mode. */
if( !strcmp(p->kdtree,"build") ) p->kdtreemode=MATCH_KDTREE_BUILD;
else if( !strcmp(p->kdtree,"internal") )
p->kdtreemode=MATCH_KDTREE_INTERNAL;
- else if( !strcmp(p->kdtree,"auto") ) p->kdtreemode=MATCH_KDTREE_AUTO;
else if( !strcmp(p->kdtree,"disable") )
p->kdtreemode=MATCH_KDTREE_DISABLE;
else if( gal_fits_name_is_fits(p->kdtree) )
p->kdtreemode=MATCH_KDTREE_FILE;
else
error(EXIT_FAILURE, 0, "'%s' is not valid for '--kdtree'. The "
"following values are accepted: 'build' (to build the k-d tree in "
"the file given to '--output'), 'internal' (to force internal "
- "usage of a k-d tree for the matching), 'auto' (to decide "
- "automatically if a k-d tree should be used or not), 'disable' "
- "(to not use a k-d tree at all), a FITS file name (the file to "
- "read a created k-d tree from)", p->kdtree);
+ "usage of a k-d tree for the matching), 'disable' (to not use a "
+ "k-d tree at all), a FITS file name (the file to read a created "
+ "k-d tree from)", p->kdtree);
/* Make sure that the k-d tree build mode is not called with
'--outcols'. */
@@ -243,6 +242,17 @@ ui_read_check_only_options(struct matchparams *p)
"tree building mode doesn't involve actual matching. It will "
"only build k-d tree and write it to a file so it can be used "
"in future matches)");
+
+ /* Make sure that a HDU is also specified for the k-d tree when its an
+ external file. */
+ if( p->kdtreemode==MATCH_KDTREE_FILE && p->kdtreehdu==NULL )
+ error(EXIT_FAILURE, 0, "no '--kdtreehdu' specified! When a FITS "
+ "file is given to '--kdtree', you should specify its "
+ "extension/HDU within the file using '--kdtreehdu'. You "
+ "can either use the HDU name, or its number (counting "
+ "from 0). If you aren't sure what HDUs exist in the file, "
+ "you can use the 'astfits %s' command to see the full list",
+ p->kdtree);
}
}
@@ -271,10 +281,9 @@ ui_check_options_and_arguments(struct matchparams *p)
/* Print a warning to let the user know the option will not be
used (the user had probably confused things if they have given
it). */
- error(EXIT_SUCCESS, 0, "WARNING: with '--coord' you only need "
- "to set '--ccol1' (for the table), '--ccol2' (for the "
- "'--coord') is not needed because you are directly "
- "giving the coordinates in any order you want");
+ error(EXIT_SUCCESS, 0, "WARNING: with '--coord' or '--kdtree', "
+ "you only need to set '--ccol1' (since there is only a "
+ "single input table), '--ccol2' is not needed");
}
}
@@ -767,6 +776,70 @@ ui_read_columns_to_double(struct matchparams *p, char
*filename, char *hdu,
+#define UI_READ_KDTREE_FIXED_MSG "You can construct a k-d tree " \
+ "for your first input table using the command below (just set your " \
+ "desired output name, and give that file to '--kdtree', in a later " \
+ "call with the same first input catalog):\n\n" \
+ " astmatch %s -h%s --ccol1=%s,%s --kdtree=build --output=KDTREE.fits\n\n"
\
+ "To learn more about the expected format of k-d trees in Gnuastro, " \
+ "please run this command: 'info gnuastro k-d'"
+
+static void
+ui_read_kdtree(struct matchparams *p)
+{
+ size_t *sizetarr;
+ char **strarr1 = p->ccol1->array;
+ gal_data_t *keysll=gal_data_array_calloc(1);
+
+ /* Read the external k-d tree. */
+ p->kdtreedata=gal_table_read(p->kdtree, p->kdtreehdu, NULL,
+ NULL, GAL_TABLE_SEARCH_NAME, 1,
+ p->cp.minmapsize, p->cp.quietmmap, NULL);
+
+ /* It has to have only two columns, with an unsigned 32-bit integer
+ type in each. */
+ if( gal_list_data_number(p->kdtreedata)!=2 )
+ error(EXIT_FAILURE, 0, "%s (hdu: %s, that was given to "
+ "'--kdtree') has %zu column(s), but should contain 2 "
+ "columns. " UI_READ_KDTREE_FIXED_MSG,
+ p->kdtree, p->kdtreehdu, gal_list_data_number(p->kdtreedata),
+ p->input1name, p->cp.hdu, strarr1[0], strarr1[1]);
+ if( p->kdtreedata->type!=GAL_TYPE_UINT32
+ || p->kdtreedata->next->type!=GAL_TYPE_UINT32 )
+ error(EXIT_FAILURE, 0, "%s (hdu: %s, that was given to "
+ "'--kdtree') doesn't have unsigned 32-bit integer columns. "
+ UI_READ_KDTREE_FIXED_MSG, p->kdtree, p->kdtreehdu,
+ p->input1name, p->cp.hdu, strarr1[0], strarr1[1]);
+ if( p->kdtreedata->size!=p->cols1->size )
+ error(EXIT_FAILURE, 0, "%s (hdu: %s, that was given to "
+ "'--kdtree') doesn't have the same number of rows as "
+ "%s, which is the first given input. "
+ UI_READ_KDTREE_FIXED_MSG, p->kdtree, p->kdtreehdu,
+ gal_fits_name_save_as_string(p->input1name, p->cp.hdu),
+ p->input1name, p->cp.hdu, strarr1[0], strarr1[1]);
+
+ /* Read the k-d tree root. */
+ keysll[0].type=GAL_TYPE_SIZE_T;
+ keysll[0].name=MATCH_KDTREE_ROOT_KEY;
+ gal_fits_key_read(p->kdtree, p->kdtreehdu, keysll, 0, 0);
+ if(keysll[0].status)
+ error(EXIT_FAILURE, 0, "%s (hdu: %s, that was given to "
+ "'--kdtree') doesn't have the '%s' keyword, or it "
+ "couldn't be read as a number. " UI_READ_KDTREE_FIXED_MSG,
+ p->kdtree, p->kdtreehdu, MATCH_KDTREE_ROOT_KEY,
+ p->input1name, p->cp.hdu, strarr1[0], strarr1[1]);
+ sizetarr=keysll[0].array;
+ p->kdtreeroot=sizetarr[0];
+
+ /* Clean up: since the 'name' component wasn't allocated in this example,
+ we should set it to NULL before calling 'gal_data_array_free'. */
+ keysll[0].name=NULL;
+ gal_data_array_free(keysll, 1, 1);
+}
+
+
+
+
/* Read catalog columns */
static void
@@ -802,6 +875,26 @@ ui_read_columns(struct matchparams *p)
: ui_read_columns_to_double(p, p->input2name, p->hdu2,
cols2, ndim) );
+ /* If an external k-d tree is given, read it and make sure it has the
+ same number of rows as the first input and the proper datatype. */
+ if( p->kdtreemode==MATCH_KDTREE_FILE )
+ ui_read_kdtree(p);
+
+ /* If we are in k-d tree based matching and the second dataset is smaller
+ than the first, print a warning to let the user know that the speed
+ can be greatly improved if they swap the two. */
+ if( !p->cp.quiet
+ && p->kdtreemode!=MATCH_KDTREE_DISABLE
+ && p->cols1->size > p->cols2->size )
+ error(EXIT_SUCCESS, 0, "TIP: the matching speed will GREATLY IMPROVE "
+ "if you swap the two inputs. Currently the second input has "
+ "fewer rows than the first. In the k-d tree based matching, "
+ "multi-threading will occur on the second input's rows and "
+ "the k-d will be constructed for the first input. Fewer "
+ "first-input rows therefore means a smaller tree, and more "
+ "second-input rows will be better distributed in your "
+ "system's CPU");
+
/* Free the extra spaces. */
gal_list_str_free(cols1, 1);
gal_list_str_free(cols2, 1);
@@ -1058,6 +1151,7 @@ ui_preparations(struct matchparams *p)
void
ui_read_check_inputs_setup(int argc, char *argv[], struct matchparams *p)
{
+ size_t nthreads;
struct gal_options_common_params *cp=&p->cp;
@@ -1110,7 +1204,32 @@ ui_read_check_inputs_setup(int argc, char *argv[],
struct matchparams *p)
/* If the output is a FITS table, prepare all the options as FITS
keywords to write in output later. */
if(gal_fits_name_is_fits(p->out1name))
- gal_options_as_fits_keywords(&p->cp);
+ gal_options_as_fits_keywords(&p->cp);
+
+
+ /* Report the starting information. */
+ /* Let the user know that processing has started. */
+ if(!p->cp.quiet)
+ {
+ printf(PROGRAM_NAME" "PACKAGE_VERSION" started on %s",
+ ctime(&p->rawtime));
+ nthreads=p->kdtreemode==MATCH_KDTREE_DISABLE ? 1 : p->cp.numthreads;
+ printf(" - Using %zu CPU thread%s%s\n", nthreads,
+ nthreads==1 ? "." : "s.",
+ ( p->kdtreemode==MATCH_KDTREE_DISABLE
+ ? " (sort-based match only uses a single thread)" : ""));
+ printf(" - Match algorithm: %s\n",
+ p->kdtree ? "k-d tree" : "sort-based");
+ printf(" - Input-1: %s\n",
+ gal_fits_name_save_as_string(p->input1name, p->cp.hdu));
+ if(p->kdtreemode==MATCH_KDTREE_FILE)
+ printf(" - Input-1 k-d tree: %s\n",
+ gal_fits_name_save_as_string(p->kdtree, p->kdtreehdu));
+ if(p->kdtreemode!=MATCH_KDTREE_BUILD)
+ printf(" - Input-2: %s\n",
+ p->coord ? "from --coord"
+ : gal_fits_name_save_as_string(p->input2name, p->hdu2));
+ }
}
@@ -1153,8 +1272,7 @@ ui_free_report(struct matchparams *p, struct timeval *t1)
gal_list_str_free(p->bcols, 0);
gal_list_str_free(p->stdinlines, 1);
- /* Print the final message.
+ /* Print the final message. */
if(!p->cp.quiet)
gal_timing_report(t1, PROGRAM_NAME" finished in: ", 0);
- */
}
diff --git a/bin/match/ui.h b/bin/match/ui.h
index bf42f70..0f33a5d 100644
--- a/bin/match/ui.h
+++ b/bin/match/ui.h
@@ -58,6 +58,7 @@ enum option_keys_enum
automatically). */
UI_KEY_NOTMATCHED = 1000,
UI_KEY_OUTCOLS,
+ UI_KEY_KDTREEHDU,
};
diff --git a/doc/gnuastro.texi b/doc/gnuastro.texi
index cb1accb..566de0e 100644
--- a/doc/gnuastro.texi
+++ b/doc/gnuastro.texi
@@ -576,6 +576,7 @@ Invoking MakeCatalog
Match
+* Matching algorithms::
* Invoking astmatch:: Inputs, outputs and options of Match
Modeling and fitting
@@ -19951,10 +19952,65 @@ The nearest objects in the two catalogs, within the
given aperture, will be foun
The aperture can be a circle or an ellipse with any orientation.
@menu
+* Matching algorithms:: Different ways to find the match
* Invoking astmatch:: Inputs, outputs and options of Match
@end menu
-@node Invoking astmatch, , Match, Match
+@node Matching algorithms, Invoking astmatch, Match, Match
+@subsection Matching algorithms
+
+Matching involves two catalogs, let's call them catalog A (with N rows) and
catalog B (with M rows).
+The most basic algorithm that immediately comes to mind is this:
+for each row in A (let's call it @mymath{A_i}), go over all the rows in B and
calculate the distance to each and find the @mymath{B_j} that is nearest to
@mymath{A_i}.
+If the distance between @mymath{A_i} and @mymath{B_j} is below a certain
acceptable threshold (or radius), consider them as a match.
+
+You may think of this (initially genius!) solution: During the parsing, when I
find that @mymath{A_i} and @mymath{B_j} satisfy the match above, I will remove
@mymath{B_j} from the search of matches for @mymath{A_k} (where @mymath{k> i}).
+As a result, as I go down A and more matches are found, I have to calculate
less distances (there are fewer elements in B that remain to be checked).
+However, this will introduce an important bias: @mymath{B_j} may actually be
closer to @mymath{A_k}!
+But because @mymath{A_i} happened to be before @mymath{A_k} in your table, you
removed @mymath{B_j} from the potential search domain of @mymath{A_k}.
+You will therefore loose that match, and replace it by a false match between
@mymath{A_i} and @mymath{B_j}!
+
+In a single-dimentional match, this bias depends on the sorting of your two
datasets (leading to different matches if you shuffle your datasets).
+But it will get more complex as you add dimensionality (for example catalogs
dervied from 2D images or 3D cubes, where you have 2 and 3 different
coordinates for each point).
+Here is how we adress this problem in Gnuastro's Match program: as we are
doing the first parsing, we keep a log of all elements in A (@mymath{A_i} and
@mymath{A_k} in the example above) that were within the radial threshold of
@mymath{B_j}.
+Afterwards, for each @mymath{B_j} we can find the nearest element of A,
irrespective of how the inputs were sorted, or their dimensionality.
+
+On the other hand, the basic parsing algorithm mentioned above is very
computationally expensive:
+@mymath{N\times M} distances have to measured, and calculating the distance
requires a square root and power of 2, which are not simple operations for the
CPU.
+As a result, this basic algorithm will become terribly slow as your datasets
grow in size.
+For example when N or M exceed hundreds of thousands (which is common in the
current days with datasets like the European Space Agency's Gaia mission).
+Therefore that basic parsing algoritm won't work and we need to use more
efficient ways to parse the two catalogs.
+Gnuastro's Match currently has two such algorithms:
+
+@table @asis
+@item Sort-based
+To avoid the problem of calculating @mymath{N\times M} Euclidean distances
(that involve a square root), this method works like this:
+
+@enumerate
+@item
+Sort the two datasets by their first coordinate.
+Therefore @mymath{A_i<A_j} (only in first coordinate; when @mymath{i<j}), and
similarly for the elements of B.
+@item
+Use the radial distance threshold to define a moving window of B over A.
+Therefore, within a single parsing of A, you can find all the elements in A
that are sufficiently near every @mymath{B_j}.
+@item
+The nearest A within the subset that is nearest to @mymath{B_j} will be
considered as the match.
+@end enumerate
+
+@item k-d tree based
+The k-d tree concept is much more abstract, but powerful (for example enabling
parallel processing, that was not possible in the sort-based method above).
+In short a k-d tree is a partitioning of a k-dimentional space.
+For more on the k-d tree and Gnuastro's implementation of it, please see
@ref{K-d tree}.
+
+To avoid getting into too much theory, let's look at it from a user's
perspective: Match will internally construct a k-d tree for catalog A (the
first catalog given to it).
+Optionally, you can also build the k-d tree of A with a separate call to Match
(with @option{--kdtree=build} and @option{--output=mykdtree.fits}).
+This external k-d tree can be fed to Match later, when doing your matches (to
avoid having to reconstruct it every time you want to match with the same first
catalog; using @option{--kdtree=mykdtree.fits}).
+
+Each thread on your CPU will then be associated a list of rows from catalog B
and the k-d tree is parsed independently for those to find the nearest element
of A to that row in B.
+@end table
+
+
+@node Invoking astmatch, , Matching algorithms, Match
@subsection Invoking Match
When given two catalogs, Match finds the rows that are nearest to each other
within an input aperture.
@@ -29335,20 +29391,21 @@ have any type), see above for the definition of
permutation.
@cindex Matching
@cindex Coordinate matching
Matching is often necessary when two measurements of the same points have been
done using different instruments (or hardware), different software or different
configurations of the same software.
-In other words, you have two catalogs or tables and each has N columns
containing the N-dimensional ``positional'' values of each point.
-Each can have other columns too, for example one can have brightness
measurements in one filter, and another can have brightness measurements in
another filter as well as morphology measurements or etc.
+In other words, you have two catalogs or tables, and each has N columns
containing the N-dimensional ``coordinate'' values of each point.
+Each table can have other columns too, for example one can have brightness
measurements in one filter, and another can have morphology measurements or etc.
-The matching functions here will use the positional columns to find the
permutation necessary to apply to both tables.
-This will enable you to match by the positions, then apply the permutation to
the brightness or morphology columns in the example above.
-The input and output data formats of the functions below are the some and
described below before the actual functions.
+The matching functions here will use the coordinate columns of the two tables
to find a permutation for each, and the total number of matched rows
(@mymath{N_{match}}).
+This will enable you to match by the positions if you like.
+A a higher level, you can apply the permutation to the brightness or
morphology columns to merge the catalogs over the @mymath{N_{match}} rows.
+The input and output data formats of the functions are the some and described
below before the actual functions.
Each function also has extra arguments due to the particular algorithm it uses
for the matching.
The two inputs of the functions (@code{coord1} and @code{coord2}) must be
@ref{List of gal_data_t}.
-Each @code{gal_data_t} node in @code{coord1} or @code{coord2} should be a
single dimensional dataset (column in a table) and all the nodes must have the
same number of elements (rows).
+Each @code{gal_data_t} node in @code{coord1} or @code{coord2} should be a
single dimensional dataset (column in a table) and all the nodes (in each) must
have the same number of elements (rows).
In other words, each column can be visualized as having the coordinates of
each point in its respective dimension.
The dimensions of the coordinates is determined by the number of
@code{gal_data_t} nodes in the two input lists (which must be equal).
-The number of rows (or the number of elements in each @code{gal_data_t}) in
the columns of @code{coord1} and @code{coord2} can be different.
-All these functions will all be satisfied if you use @code{gal_table_read} to
read the two coordinate columns, see @ref{Table input output}.
+The number of rows (or the number of elements in each @code{gal_data_t}) in
the columns of @code{coord1} and @code{coord2} can (and, usually will!) be
different.
+In summary, these functions will be happy if you use @code{gal_table_read} to
read the two coordinate columns from a file, see @ref{Table input output}.
@cindex Permutation
The functions below return a simply-linked list of three 1D datasets (see
@ref{List of gal_data_t}), let's call the returned dataset @code{ret}.
@@ -29360,20 +29417,24 @@ If there wasn't any match, this function will return
@code{NULL}.
The two permutations can be applied to the rows of the two inputs: the first
one (@code{ret}) should be applied to the rows of the table containing
@code{coord1} and the second one (@code{ret->next}) to the table containing
@code{coord2}.
After applying the returned permutations to the inputs, the top
@code{nummatched} elements of both will match with each other.
-The ordering of the rest of the elements is undefined (depends on the matching
funciton used).
+The ordering of the rest of the elements is undefined (depends on the matching
function used).
The third node is the distances between the respective match (which may be
elliptical distance, see discussion of ``aperture'' below).
The functions will not simply return the nearest neighbor as a match.
-The nearest neighbor may be too far to be a meaningful.
-They will check the distance between the distance of the nearest neighbor of
each point and only return a match for it it is within an acceptable
N-dimensional distance (or ``aperture'').
+This is because the nearest neighbor may be too far to be a meaningful!
+They will check the distance between the nearest neighbor of each point and
only return a match if it is within an acceptable N-dimensional distance (or
``aperture'').
The matching aperture is defined by the @code{aperture} array that is an input
argument to the functions.
-If several points of one catalog lie within this aperture of a point in the
other, the nearest is defined as the match.
-In a 2D situation (where the input lists have two nodes), for the most generic
case, it must have three elements: the major axis length, axis ratio and
position angle (see @ref{Defining an ellipse and ellipsoid}).
+
+If several points of one catalog lie within this aperture of a point in the
other catalog, the nearest is defined as the match.
+In a 2D situation (where the input lists have two nodes), for the most generic
case, @code{aperture} must have three elements: the major axis length, axis
ratio and position angle (see @ref{Defining an ellipse and ellipsoid}).
If @code{aperture[1]==1}, the aperture will be a circle of radius
@code{aperture[0]} and the third value won't be used.
When the aperture is an ellipse, distances between the points are also
calculated in the respective elliptical distances (@mymath{r_{el}} in
@ref{Defining an ellipse and ellipsoid}).
-
-
+@strong{Output permutations ignore internal sorting}: the output permutations
will correspond to the initial inputs.
+Therefore, even when @code{inplace!=0} (and this function re-arranges the
inputs in place), the output permutation will correspond to original (possibly
non-sorted) inputs. The reason for this is that you rarely want to permute the
actual positional columns after the match.
+Usually, you also have other columns (for example the brightness, morphology
and etc) and you want to find how they differ between the objects that match.
+Once you have the permutations, they can be applied to those other columns
(see @ref{Permutations}) and the higher-level processing can continue.
+So if you don't need the coordinate columns for the rest of your analysis, it
is better to set @code{inplace=1}.
@deftypefun {gal_data_t *} gal_match_sort_based (gal_data_t @code{*coord1},
gal_data_t @code{*coord2}, double @code{*aperture}, int @code{sorted_by_first},
int @code{inplace}, size_t @code{minmapsize}, int @code{quietmmap}, size_t
@code{*nummatched})
@@ -29386,17 +29447,19 @@ When sorting is necessary and @code{inplace} is
non-zero, the actual input colum
Otherwise, an internal copy of the inputs will be made, used (sorted) and
later freed before returning.
Therefore, when @code{inplace==0}, inputs will remain untouched, but this
function will take more time and memory.
If internal allocation is necessary and the space is larger than
@code{minmapsize}, the space will be not allocated in the RAM, but in a file,
see description of @option{--minmapsize} and @code{--quietmmap} in
@ref{Processing options}.
+@end deftypefun
-@cartouche
-@noindent
-@strong{Output permutations ignore internal sorting}: the output permutations
will correspond to the initial inputs.
-Therefore, even when @code{inplace!=0} (and this function re-arranges the
inputs in place), the output permutation will correspond to original (possibly
non-sorted) inputs.
+@deftypefun {gal_data_t *} gal_match_kdtree (gal_data_t @code{*coord1},
gal_data_t @code{*coord2}, gal_data_t @code{*coord1_kdtree}, size_t
@code{kdtree_root}, double @code{*aperture}, size_t @code{numthreads}, size_t
@code{minmapsize}, int @code{quietmmap}, size_t @code{*nummatched})
+
+@cindex Matching by k-d tree
+@cindex k-d tree matching
+Use the k-d tree concept for finding matches between two catalogs, optionally
in parallel (on @code{numthreads} threads).
+The k-d tree of the first input (@code{coord1_kdtree}), and its root index
(@code{kdtree_root}), should be constructed and found before calling this
function, to do this, you can use the @code{gal_kdtree_create} of @ref{K-d
tree}.
+The desired @code{aperture} array is the same as @code{gal_match_sort_based}
and discribed at the top of this section.
+
+The final number of matches is returned in @code{nummatched} and the format of
the returned dataset (three columns) is described above.
+If internal allocation is necessary and the space is larger than
@code{minmapsize}, the space will be not allocated in the RAM, but in a file,
see description of @option{--minmapsize} and @code{--quietmmap} in
@ref{Processing options}.
-The reason for this is that you rarely want to permute the actual positional
columns after the match.
-Usually, you also have other columns (for example the brightness, morphology
and etc) and you want to find how they differ between the objects that match.
-Once you have the permutations, they can be applied to those other columns
(see @ref{Permutations}) and the higher-level processing can continue.
-So if you don't need the coordinate columns for the rest of your analysis, it
is better to set @code{inplace=1}.
-@end cartouche
@end deftypefun
@node Statistical operations, Binary datasets, Matching, Gnuastro library
diff --git a/during-dev-test-data/kdtree-input.fits
b/during-dev-test-data/kdtree-input.fits
deleted file mode 100644
index d3780eb..0000000
Binary files a/during-dev-test-data/kdtree-input.fits and /dev/null differ
diff --git a/during-dev-test-data/match-query.txt
b/during-dev-test-data/match-query.txt
deleted file mode 100644
index 394bf51..0000000
--- a/during-dev-test-data/match-query.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-14.55495528995644 13.952740903340256 14.512879995960738
-14.156180015003525 12.80845342418684 14.826212427784835
-13.505267742341026 13.013443271481133 11.183241471767612
-13.09715467421369 10.936844869289065 10.692733635319755
-11.855001056969742 14.100516383294387 11.839440238220615
-11.766844757741728 10.752701002494069 12.958678702895487
-12.455862588902148 10.943892162638429 11.946732314414476
-13.386374627834787 11.49741032889326 13.000603833689892
-14.49239538261452 11.133665168041762 12.959051224814937
-10.968887621940603 13.55609990177711 14.863758607257214
diff --git a/during-dev-test-data/scripts/gen-inputs.sh
b/during-dev-test-data/scripts/gen-inputs.sh
new file mode 100755
index 0000000..21cd7e1
--- /dev/null
+++ b/during-dev-test-data/scripts/gen-inputs.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# If you want to use a real catalog as reference (and manually add noise to
+# randomly selected rows, give a value to the 'real' variable). If this is
+# empty, a mock table with two columns of integers as input will be
+# created.
+real=~/tmp/gaia.fits
+real_c1=ra1
+real_c2=dec1
+
+# Number of points (number of inputs in mock, and number of rows in noised
+# catalog for real).
+num=300000
+
+# Scatter around each coordinate to add and aperture size for mock and real
+# tables.
+sigma_mock=1
+sigma_real=0.000555556 # 2 arcsecond (in degrees)
+
+# Build the base catalog input
+if [ x"$real" = x ]; then
+ sigma=$sigma_mock
+ for i in $(seq 1 $num); do
+ echo "$i $i"
+ done | asttable ../base.fits
+else
+ sigma=$sigma_real
+ asttable $real -c$real_c1,$real_c2 --output=../base.fits
+fi
+
+# Add noise to both columns to build the new dataset.
+asttable ../base.fits -c'arith $1 '$sigma' mknoise-sigma' \
+ -c'arith $2 '$sigma' mknoise-sigma' --rowrandom=$num \
+ -o../base-noised.fits
- [gnuastro-commits] master 7354e33 09/19: Library (match.h): match_coordinate_ replaced by match_sort_based_, (continued)
- [gnuastro-commits] master 7354e33 09/19: Library (match.h): match_coordinate_ replaced by match_sort_based_, Mohammad Akhlaghi, 2021/11/14
- [gnuastro-commits] master 03d1a25 15/19: Library (fits.h): corrected segmentation fault in parallel reading, Mohammad Akhlaghi, 2021/11/14
- [gnuastro-commits] master 336ddee 04/19: Error in `match_kdtree_worker` in lib/match.c, Mohammad Akhlaghi, 2021/11/14
- [gnuastro-commits] master 2877acb 05/19: Added fits files for testing in `during-dev-test-data`, Mohammad Akhlaghi, 2021/11/14
- [gnuastro-commits] master 48d760d 12/19: Library (match.h): k-d tree method discards regions with no overlap, Mohammad Akhlaghi, 2021/11/14
- [gnuastro-commits] master d0b19b8 10/19: Match: make check script for k-d tree matching now executable, Mohammad Akhlaghi, 2021/11/14
- [gnuastro-commits] master 095c788 13/19: Match: matched rows aren't permuted, but new column allocated, Mohammad Akhlaghi, 2021/11/14
- [gnuastro-commits] master 9e258a8 14/19: Library (fits.h): reading table columns now done in parallel, Mohammad Akhlaghi, 2021/11/14
- [gnuastro-commits] master 29f8b20 16/19: Table and Arithmetic: corrected some memory leaks, Mohammad Akhlaghi, 2021/11/14
- [gnuastro-commits] master 8b17675 06/19: Corrected segmentation fault, included aperture, Mohammad Akhlaghi, 2021/11/14
- [gnuastro-commits] master a7bfa5b 11/19: Match: k-d tree matching UI fixed, to do: finalizing docs and tests,
Mohammad Akhlaghi <=
- [gnuastro-commits] master f5d7d1a 19/19: Match: added tests, completed docs of new k-d tree based matching, Mohammad Akhlaghi, 2021/11/14
- [gnuastro-commits] master 6f7ff61 08/19: Library (match.h): Using the old infra-structure for double-matches, Mohammad Akhlaghi, 2021/11/14
- [gnuastro-commits] master 68823e3 17/19: Library (fits.h): correctly reading columns of 0 width/repeat, Mohammad Akhlaghi, 2021/11/14
- [gnuastro-commits] master a6b838c 18/19: Match: script to generate debugging input taken in bin/match/, Mohammad Akhlaghi, 2021/11/14