gnuastro-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[gnuastro-commits] master 05f7d4e 1/2: Statistics library function to re


From: Mohammad Akhlaghi
Subject: [gnuastro-commits] master 05f7d4e 1/2: Statistics library function to remove outliers
Date: Sun, 19 Aug 2018 19:09:50 -0400 (EDT)

branch: master
commit 05f7d4ee822d6d8b846088c234cf73b086fd25ac
Author: Mohammad Akhlaghi <address@hidden>
Commit: Mohammad Akhlaghi <address@hidden>

    Statistics library function to remove outliers
    
    A new statistics library function called `gal_statistics_outlier_positive'
    is now available in the library to help in identifying and removing
    outliers from a distribution.
---
 NEWS                      |   1 +
 doc/gnuastro.texi         |  40 ++++++++++++++
 lib/gnuastro/statistics.h |   5 +-
 lib/statistics.c          | 132 +++++++++++++++++++++++++++-------------------
 4 files changed, 123 insertions(+), 55 deletions(-)

diff --git a/NEWS b/NEWS
index f4886d1..061735f 100644
--- a/NEWS
+++ b/NEWS
@@ -15,6 +15,7 @@ GNU Astronomy Utilities NEWS                          -*- 
outline -*-
     -- gal_fits_key_write_in_ptr: New name of gal_fits_key_write.
     -- gal_fits_key_write_version_in_ptr: new name of 
gal_fits_key_write_version.
     -- gal_fits_key_write_config: write key list and version as config header.
+    -- gal_statistics_outlier_positive: Find the first positive outlier.
 
 ** Removed features
 
diff --git a/doc/gnuastro.texi b/doc/gnuastro.texi
index c24683d..fa8987a 100644
--- a/doc/gnuastro.texi
+++ b/doc/gnuastro.texi
@@ -27191,6 +27191,46 @@ above.
 @end deftypefun
 
 
address@hidden {gal_data_t *} gal_statistics_outlier_positive (gal_data_t 
@code{*input}, size_t @code{window_size}, float @code{sigma}, float 
@code{sigclip_multip}, float @code{sigclip_param}, int @code{inplace})
+Find the first positive outlier in the @code{input} distribution. The
+returned dataset contains a single element: the first positive outlier. It
+is one of the dataset's elements, in the same type as the input. If the
+process fails for any reason (for example @code{window_size} is larger than
+the number of @code{input}'s non-blank elements, or no outlier was found),
+a @code{NULL} pointer will be returned.
+
+All (possibly existing) blank elements are first removed from the input
+dataset, then it is sorted. A sliding window of @code{window_size} elements
+is parsed over the dataset and is used as a reference to identify the
+outlier. The first element where the distance to the previous (sorted)
+element is @code{sigma} units away from the distribution of distances of
+the @code{window_size} previous elements is considered an outlier and
+returned by this function.
+
+Formally, Let's take @mymath{v_i} to be the @mymath{i}-th element of the
+sorted input (with no blank values) and @mymath{m} and @mymath{\sigma} as
+the @mymath{\sigma}-clipped median and standard deviation from the
+distances of the previous @code{window_size} elements. If the value given
+to @code{sigma} is displayed with @mymath{s}, the returned outlier is the
+first (sorted) elemented where the following condition becomes true:
+
address@hidden(v_i-v_{i-1})-m\over \sigma}>s}
+
+The @code{sigclip_multip} and @code{sigclip_param} arguments specify the
+properties of the @mymath{\sigma}-clipping. You see that by this
+definition, the outlier cannot be any of the first @code{window_size}
+elements, since they constitute the window that is used for the first
+checked element.
+
+If @code{inplace!=0}, the removing of blank elements and sorting will be
+done within the input dataset's allocated space. Otherwise, this function
+will internally allocate (and later free) the necessary space to keep the
+intermediate space that this process requires.
+
address@hidden deftypefun
+
+
+
 
 @node Binary datasets, Labeled datasets, Statistical operations, Gnuastro 
library
 @subsection Binary datasets (@file{binary.h})
diff --git a/lib/gnuastro/statistics.h b/lib/gnuastro/statistics.h
index ad96d01..f7860d7 100644
--- a/lib/gnuastro/statistics.h
+++ b/lib/gnuastro/statistics.h
@@ -173,7 +173,10 @@ gal_data_t *
 gal_statistics_sigma_clip(gal_data_t *input, float multip, float param,
                           int inplace, int quiet);
 
-
+gal_data_t *
+gal_statistics_outlier_positive(gal_data_t *input, size_t window_size,
+                                float sigma, float sigclip_multip,
+                                float sigclip_param, int inplace);
 
 
 
diff --git a/lib/statistics.c b/lib/statistics.c
index ce77950..22ed7af 100644
--- a/lib/statistics.c
+++ b/lib/statistics.c
@@ -2070,61 +2070,85 @@ gal_statistics_sigma_clip(gal_data_t *input, float 
multip, float param,
 
 
 
-#if 0
-/* Using the cumulative distribution function this function will
-   remove outliers from a dataset. */
-void
-gal_statistics_remove_outliers_flat_cdf(float *sorted, size_t *outsize)
+/* Find the first outlier in a distribution. */
+#define OUTLIER_BYTYPE(IT) {                                            \
+    IT *arr=nbs->array;                                                 \
+    for(i=window_size;i<nbs->size;++i)                                  \
+      {                                                                 \
+        /* Fill in the distance array. */                               \
+        for(j=0; j<window_size; ++j)                                    \
+          darr[j] = arr[i-window_size+j+1] - arr[i-window_size+j];      \
+                                                                        \
+        /* Get the sigma-clipped information. */                        \
+        sclip=gal_statistics_sigma_clip(dist, sigclip_multip,           \
+                                        sigclip_param, 0, 1);           \
+        sarr=sclip->array;                                              \
+                                                                        \
+        /* For a check                                   */             \
+        /* printf("%f (%f, %f)\n",                       */             \
+        /*        ((double)(arr[i]-arr[i-1])) - sarr[1], */             \
+        /*        sarr[1], sigma*sarr[3]);               */             \
+                                                                        \
+        /* Terminate the loop if the dist. is larger than requested. */ \
+        /* This shows we have reached the first outlier's position. */  \
+        if( (((double)(arr[i]-arr[i-1])) - sarr[1]) > sigma*sarr[3] )   \
+          {                                                             \
+            /* Allocate the output dataset. */                          \
+            out=gal_data_alloc(NULL, input->type, 1, &one, NULL, 0, -1, \
+                               NULL, NULL, NULL);                       \
+                                                                        \
+            /* Write the outlier, clean up and break. */                \
+            *(IT *)(out->array)=arr[i-1];                               \
+            gal_data_free(sclip);                                       \
+            break;                                                      \
+          }                                                             \
+                                                                        \
+        /* Clean up (if we get here). */                                \
+        gal_data_free(sclip);                                           \
+      }                                                                 \
+  }
+gal_data_t *
+gal_statistics_outlier_positive(gal_data_t *input, size_t window_size,
+                                float sigma, float sigclip_multip,
+                                float sigclip_param, int inplace)
 {
-  printf("\n ... in gal_statistics_remove_outliers_flat_cdf ... \n");
-  exit(1);
-  int firstfound=0;
-  size_t size=*outsize, i, maxind;
-  float *slopes, minslope, maxslope;
-
-  /* Find a slopes array, think of the cumulative frequency plot when
-     you want to think about slopes. */
-  errno=0; slopes=malloc(size*sizeof *slopes);
-  if(slopes==NULL)
-    error(EXIT_FAILURE, errno, "%s: %zu bytes for slopes",
-          __func__, size*sizeof *slopes);
-
-  /* Calcuate the slope of the CDF and put it in the slopes array. */
-  for(i=1;i<size-1;++i)
-    slopes[i]=2/(sorted[i+1]-sorted[i-1]);
-
-  /* Find the position of the maximum slope, note that around the
-     distribution mode, the difference between the values varies less,
-     so two neighbouring elements have the closest values, hence the
-     largest slope (when their difference is in the denominator). */
-  gal_statistics_f_max_with_index(slopes+1, size-2, &maxslope, &maxind);
-
-  /* Find the minimum slope from the second element (for the first the
-     slope is not defined. NOTE; maxind is one smaller than it should
-     be because the input array to find it began from the second
-     element. */
-  gal_statistics_float_second_min(slopes+1, maxind+1, &minslope);
-
-  /* Find the second place where the slope falls below `minslope`
-     after the maximum position. When found, add it with one to
-     account for error. Note that incase there are no outliers by this
-     definition, then the final size will be equal to size! */
-  for(i=maxind+1;i<size-1;++i)
-    if(slopes[i]<minslope)
-      {
-        if(firstfound)
-          break;
-        else
-          firstfound=1;
-      }
-  *outsize=i+1;
+  float *sarr;
+  double *darr;
+  size_t i, j, one=1;
+  gal_data_t *dist, *sclip, *nbs, *out=NULL;
 
-  /*
-  for(i=0;i<size;++i)
-    printf("%zu\t%.3f\t%.3f\n", i, arr[i], slopes[i]);
-  printf("\n\nPlace to cut off for outliers is: %zu\n\n", *outsize);
-  */
+  /* Remove all blanks and sort the dataset. */
+  nbs=gal_statistics_no_blank_sorted(input, inplace);
+
+  /* A small sanity check. */
+  if(window_size<nbs->size && window_size!=0)
+    {
+      /* Allocate space to keep the distances. */
+      dist=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &window_size, NULL,
+                          0, -1, NULL, NULL, NULL);
+      darr=dist->array;
+
+      /* Find the outlier based on the type of the input dataset. */
+      switch(input->type)
+        {
+        case GAL_TYPE_UINT8:     OUTLIER_BYTYPE( uint8_t  );   break;
+        case GAL_TYPE_INT8:      OUTLIER_BYTYPE( int8_t   );   break;
+        case GAL_TYPE_UINT16:    OUTLIER_BYTYPE( uint16_t );   break;
+        case GAL_TYPE_INT16:     OUTLIER_BYTYPE( int16_t  );   break;
+        case GAL_TYPE_UINT32:    OUTLIER_BYTYPE( uint32_t );   break;
+        case GAL_TYPE_INT32:     OUTLIER_BYTYPE( int32_t  );   break;
+        case GAL_TYPE_UINT64:    OUTLIER_BYTYPE( uint64_t );   break;
+        case GAL_TYPE_INT64:     OUTLIER_BYTYPE( int64_t  );   break;
+        case GAL_TYPE_FLOAT32:   OUTLIER_BYTYPE( float    );   break;
+        case GAL_TYPE_FLOAT64:   OUTLIER_BYTYPE( double   );   break;
+        default:
+          error(EXIT_FAILURE, 0, "%s: type code %d not recognized",
+                __func__, input->type);
+        }
+    }
 
-  free(slopes);
+  /* Clean up and return. */
+  gal_data_free(dist);
+  if(nbs!=input) gal_data_free(nbs);
+  return out;
 }
-#endif



reply via email to

[Prev in Thread] Current Thread [Next in Thread]