[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[gnuastro-commits] master 05f7d4e 1/2: Statistics library function to re
From: |
Mohammad Akhlaghi |
Subject: |
[gnuastro-commits] master 05f7d4e 1/2: Statistics library function to remove outliers |
Date: |
Sun, 19 Aug 2018 19:09:50 -0400 (EDT) |
branch: master
commit 05f7d4ee822d6d8b846088c234cf73b086fd25ac
Author: Mohammad Akhlaghi <address@hidden>
Commit: Mohammad Akhlaghi <address@hidden>
Statistics library function to remove outliers
A new statistics library function called `gal_statistics_outlier_positive'
is now available in the library to help in identifying and removing
outliers from a distribution.
---
NEWS | 1 +
doc/gnuastro.texi | 40 ++++++++++++++
lib/gnuastro/statistics.h | 5 +-
lib/statistics.c | 132 +++++++++++++++++++++++++++-------------------
4 files changed, 123 insertions(+), 55 deletions(-)
diff --git a/NEWS b/NEWS
index f4886d1..061735f 100644
--- a/NEWS
+++ b/NEWS
@@ -15,6 +15,7 @@ GNU Astronomy Utilities NEWS -*-
outline -*-
-- gal_fits_key_write_in_ptr: New name of gal_fits_key_write.
-- gal_fits_key_write_version_in_ptr: new name of
gal_fits_key_write_version.
-- gal_fits_key_write_config: write key list and version as config header.
+ -- gal_statistics_outlier_positive: Find the first positive outlier.
** Removed features
diff --git a/doc/gnuastro.texi b/doc/gnuastro.texi
index c24683d..fa8987a 100644
--- a/doc/gnuastro.texi
+++ b/doc/gnuastro.texi
@@ -27191,6 +27191,46 @@ above.
@end deftypefun
address@hidden {gal_data_t *} gal_statistics_outlier_positive (gal_data_t
@code{*input}, size_t @code{window_size}, float @code{sigma}, float
@code{sigclip_multip}, float @code{sigclip_param}, int @code{inplace})
+Find the first positive outlier in the @code{input} distribution. The
+returned dataset contains a single element: the first positive outlier. It
+is one of the dataset's elements, in the same type as the input. If the
+process fails for any reason (for example @code{window_size} is larger than
+the number of @code{input}'s non-blank elements, or no outlier was found),
+a @code{NULL} pointer will be returned.
+
+All (possibly existing) blank elements are first removed from the input
+dataset, then it is sorted. A sliding window of @code{window_size} elements
+is parsed over the dataset and is used as a reference to identify the
+outlier. The first element where the distance to the previous (sorted)
+element is @code{sigma} units away from the distribution of distances of
+the @code{window_size} previous elements is considered an outlier and
+returned by this function.
+
+Formally, Let's take @mymath{v_i} to be the @mymath{i}-th element of the
+sorted input (with no blank values) and @mymath{m} and @mymath{\sigma} as
+the @mymath{\sigma}-clipped median and standard deviation from the
+distances of the previous @code{window_size} elements. If the value given
+to @code{sigma} is displayed with @mymath{s}, the returned outlier is the
+first (sorted) elemented where the following condition becomes true:
+
address@hidden(v_i-v_{i-1})-m\over \sigma}>s}
+
+The @code{sigclip_multip} and @code{sigclip_param} arguments specify the
+properties of the @mymath{\sigma}-clipping. You see that by this
+definition, the outlier cannot be any of the first @code{window_size}
+elements, since they constitute the window that is used for the first
+checked element.
+
+If @code{inplace!=0}, the removing of blank elements and sorting will be
+done within the input dataset's allocated space. Otherwise, this function
+will internally allocate (and later free) the necessary space to keep the
+intermediate space that this process requires.
+
address@hidden deftypefun
+
+
+
@node Binary datasets, Labeled datasets, Statistical operations, Gnuastro
library
@subsection Binary datasets (@file{binary.h})
diff --git a/lib/gnuastro/statistics.h b/lib/gnuastro/statistics.h
index ad96d01..f7860d7 100644
--- a/lib/gnuastro/statistics.h
+++ b/lib/gnuastro/statistics.h
@@ -173,7 +173,10 @@ gal_data_t *
gal_statistics_sigma_clip(gal_data_t *input, float multip, float param,
int inplace, int quiet);
-
+gal_data_t *
+gal_statistics_outlier_positive(gal_data_t *input, size_t window_size,
+ float sigma, float sigclip_multip,
+ float sigclip_param, int inplace);
diff --git a/lib/statistics.c b/lib/statistics.c
index ce77950..22ed7af 100644
--- a/lib/statistics.c
+++ b/lib/statistics.c
@@ -2070,61 +2070,85 @@ gal_statistics_sigma_clip(gal_data_t *input, float
multip, float param,
-#if 0
-/* Using the cumulative distribution function this function will
- remove outliers from a dataset. */
-void
-gal_statistics_remove_outliers_flat_cdf(float *sorted, size_t *outsize)
+/* Find the first outlier in a distribution. */
+#define OUTLIER_BYTYPE(IT) { \
+ IT *arr=nbs->array; \
+ for(i=window_size;i<nbs->size;++i) \
+ { \
+ /* Fill in the distance array. */ \
+ for(j=0; j<window_size; ++j) \
+ darr[j] = arr[i-window_size+j+1] - arr[i-window_size+j]; \
+ \
+ /* Get the sigma-clipped information. */ \
+ sclip=gal_statistics_sigma_clip(dist, sigclip_multip, \
+ sigclip_param, 0, 1); \
+ sarr=sclip->array; \
+ \
+ /* For a check */ \
+ /* printf("%f (%f, %f)\n", */ \
+ /* ((double)(arr[i]-arr[i-1])) - sarr[1], */ \
+ /* sarr[1], sigma*sarr[3]); */ \
+ \
+ /* Terminate the loop if the dist. is larger than requested. */ \
+ /* This shows we have reached the first outlier's position. */ \
+ if( (((double)(arr[i]-arr[i-1])) - sarr[1]) > sigma*sarr[3] ) \
+ { \
+ /* Allocate the output dataset. */ \
+ out=gal_data_alloc(NULL, input->type, 1, &one, NULL, 0, -1, \
+ NULL, NULL, NULL); \
+ \
+ /* Write the outlier, clean up and break. */ \
+ *(IT *)(out->array)=arr[i-1]; \
+ gal_data_free(sclip); \
+ break; \
+ } \
+ \
+ /* Clean up (if we get here). */ \
+ gal_data_free(sclip); \
+ } \
+ }
+gal_data_t *
+gal_statistics_outlier_positive(gal_data_t *input, size_t window_size,
+ float sigma, float sigclip_multip,
+ float sigclip_param, int inplace)
{
- printf("\n ... in gal_statistics_remove_outliers_flat_cdf ... \n");
- exit(1);
- int firstfound=0;
- size_t size=*outsize, i, maxind;
- float *slopes, minslope, maxslope;
-
- /* Find a slopes array, think of the cumulative frequency plot when
- you want to think about slopes. */
- errno=0; slopes=malloc(size*sizeof *slopes);
- if(slopes==NULL)
- error(EXIT_FAILURE, errno, "%s: %zu bytes for slopes",
- __func__, size*sizeof *slopes);
-
- /* Calcuate the slope of the CDF and put it in the slopes array. */
- for(i=1;i<size-1;++i)
- slopes[i]=2/(sorted[i+1]-sorted[i-1]);
-
- /* Find the position of the maximum slope, note that around the
- distribution mode, the difference between the values varies less,
- so two neighbouring elements have the closest values, hence the
- largest slope (when their difference is in the denominator). */
- gal_statistics_f_max_with_index(slopes+1, size-2, &maxslope, &maxind);
-
- /* Find the minimum slope from the second element (for the first the
- slope is not defined. NOTE; maxind is one smaller than it should
- be because the input array to find it began from the second
- element. */
- gal_statistics_float_second_min(slopes+1, maxind+1, &minslope);
-
- /* Find the second place where the slope falls below `minslope`
- after the maximum position. When found, add it with one to
- account for error. Note that incase there are no outliers by this
- definition, then the final size will be equal to size! */
- for(i=maxind+1;i<size-1;++i)
- if(slopes[i]<minslope)
- {
- if(firstfound)
- break;
- else
- firstfound=1;
- }
- *outsize=i+1;
+ float *sarr;
+ double *darr;
+ size_t i, j, one=1;
+ gal_data_t *dist, *sclip, *nbs, *out=NULL;
- /*
- for(i=0;i<size;++i)
- printf("%zu\t%.3f\t%.3f\n", i, arr[i], slopes[i]);
- printf("\n\nPlace to cut off for outliers is: %zu\n\n", *outsize);
- */
+ /* Remove all blanks and sort the dataset. */
+ nbs=gal_statistics_no_blank_sorted(input, inplace);
+
+ /* A small sanity check. */
+ if(window_size<nbs->size && window_size!=0)
+ {
+ /* Allocate space to keep the distances. */
+ dist=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &window_size, NULL,
+ 0, -1, NULL, NULL, NULL);
+ darr=dist->array;
+
+ /* Find the outlier based on the type of the input dataset. */
+ switch(input->type)
+ {
+ case GAL_TYPE_UINT8: OUTLIER_BYTYPE( uint8_t ); break;
+ case GAL_TYPE_INT8: OUTLIER_BYTYPE( int8_t ); break;
+ case GAL_TYPE_UINT16: OUTLIER_BYTYPE( uint16_t ); break;
+ case GAL_TYPE_INT16: OUTLIER_BYTYPE( int16_t ); break;
+ case GAL_TYPE_UINT32: OUTLIER_BYTYPE( uint32_t ); break;
+ case GAL_TYPE_INT32: OUTLIER_BYTYPE( int32_t ); break;
+ case GAL_TYPE_UINT64: OUTLIER_BYTYPE( uint64_t ); break;
+ case GAL_TYPE_INT64: OUTLIER_BYTYPE( int64_t ); break;
+ case GAL_TYPE_FLOAT32: OUTLIER_BYTYPE( float ); break;
+ case GAL_TYPE_FLOAT64: OUTLIER_BYTYPE( double ); break;
+ default:
+ error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
+ __func__, input->type);
+ }
+ }
- free(slopes);
+ /* Clean up and return. */
+ gal_data_free(dist);
+ if(nbs!=input) gal_data_free(nbs);
+ return out;
}
-#endif