1 | /* multirobust.c |
2 | * |
3 | * Copyright (C) 2013 Patrick Alken |
4 | * |
5 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License as published by |
7 | * the Free Software Foundation; either version 3 of the License, or (at |
8 | * your option) any later version. |
9 | * |
10 | * This program is distributed in the hope that it will be useful, but |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | * General Public License for more details. |
14 | * |
15 | * You should have received a copy of the GNU General Public License |
16 | * along with this program; if not, write to the Free Software |
17 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
18 | * |
19 | * This module contains routines related to robust linear least squares. The |
20 | * algorithm used closely follows the publications: |
21 | * |
22 | * [1] DuMouchel, W. and F. O'Brien (1989), "Integrating a robust |
23 | * option into a multiple regression computing environment," |
24 | * Computer Science and Statistics: Proceedings of the 21st |
25 | * Symposium on the Interface, American Statistical Association |
26 | * |
27 | * [2] Street, J.O., R.J. Carroll, and D. Ruppert (1988), "A note on |
28 | * computing robust regression estimates via iteratively |
29 | * reweighted least squares," The American Statistician, v. 42, |
30 | * pp. 152-154. |
31 | */ |
32 | |
33 | #include <config.h> |
34 | #include <gsl/gsl_math.h> |
35 | #include <gsl/gsl_multifit.h> |
36 | #include <gsl/gsl_vector.h> |
37 | #include <gsl/gsl_matrix.h> |
38 | #include <gsl/gsl_statistics.h> |
39 | #include <gsl/gsl_sort.h> |
40 | #include <gsl/gsl_sort_vector.h> |
41 | #include <gsl/gsl_blas.h> |
42 | #include <gsl/gsl_linalg.h> |
43 | |
44 | static int robust_test_convergence(const gsl_vector *c_prev, const gsl_vector *c, |
45 | const double tol); |
46 | static double robust_madsigma(const gsl_vector *r, const size_t p, gsl_vector *workn); |
47 | static double robust_robsigma(const gsl_vector *r, const double s, |
48 | const double tune, gsl_multifit_robust_workspace *w); |
49 | static double robust_sigma(const double s_ols, const double s_rob, |
50 | gsl_multifit_robust_workspace *w); |
51 | static int robust_covariance(const double sigma, gsl_matrix *cov, |
52 | gsl_multifit_robust_workspace *w); |
53 | |
54 | /* |
55 | gsl_multifit_robust_alloc |
56 | Allocate a robust workspace |
57 | |
58 | Inputs: T - robust weighting algorithm |
59 | n - number of observations |
60 | p - number of model parameters |
61 | |
62 | Return: pointer to workspace |
63 | */ |
64 | |
65 | gsl_multifit_robust_workspace * |
66 | gsl_multifit_robust_alloc(const gsl_multifit_robust_type *T, |
| 1Start Analysis. |
67 | const size_t n, const size_t p) |
68 | { |
69 | gsl_multifit_robust_workspace *w; |
70 | |
71 | if (n < p) |
| 2Take the false branch.    n < p |
72 | { |
73 | GSL_ERROR_VAL("observations n must be >= p", GSL_EINVAL, 0); |
74 | } |
75 | |
76 | w = calloc(1, sizeof(gsl_multifit_robust_workspace)); |
| 3Call a function.    calloc(1, sizeof(gsl_multifit_robust_workspace)) |
| 47Memory allocated in heap space is not freed.    calloc(1, sizeof(gsl_multifit_robust_workspace)) |
77 | if (w == 0) |
| 4Take the false branch.    w == 0 |
78 | { |
79 | GSL_ERROR_VAL("failed to allocate space for multifit_robust struct", |
80 | GSL_ENOMEM, 0); |
81 | } |
82 | |
83 | w->n = n; |
84 | w->p = p; |
85 | w->type = T; |
86 | w->maxiter = 100; /* maximum iterations */ |
87 | w->tune = w->type->tuning_default; |
88 | |
89 | w->multifit_p = gsl_multifit_linear_alloc(n, p); |
| 5Call a function.    gsl_multifit_linear_alloc(n, p) |
90 | if (w->multifit_p == 0) |
| 24Take the false branch.    w->multifit_p == 0 |
91 | { |
92 | GSL_ERROR_VAL("failed to allocate space for multifit_linear struct", |
93 | GSL_ENOMEM, 0); |
94 | } |
95 | |
96 | w->r = gsl_vector_alloc(n); |
| 25Call a function.    gsl_vector_alloc(n) |
97 | if (w->r == 0) |
| 26Take the false branch.    w->r == 0 |
98 | { |
99 | GSL_ERROR_VAL("failed to allocate space for residuals", |
100 | GSL_ENOMEM, 0); |
101 | } |
102 | |
103 | w->weights = gsl_vector_alloc(n); |
| 27Call a function.    gsl_vector_alloc(n) |
104 | if (w->weights == 0) |
| 28Take the false branch.    w->weights == 0 |
105 | { |
106 | GSL_ERROR_VAL("failed to allocate space for weights", GSL_ENOMEM, 0); |
107 | } |
108 | |
109 | w->c_prev = gsl_vector_alloc(p); |
| 29Call a function.    gsl_vector_alloc(p) |
110 | if (w->c_prev == 0) |
| 30Take the false branch.    w->c_prev == 0 |
111 | { |
112 | GSL_ERROR_VAL("failed to allocate space for c_prev", GSL_ENOMEM, 0); |
113 | } |
114 | |
115 | w->resfac = gsl_vector_alloc(n); |
| 31Call a function.    gsl_vector_alloc(n) |
116 | if (w->resfac == 0) |
| 32Take the false branch.    w->resfac == 0 |
117 | { |
118 | GSL_ERROR_VAL("failed to allocate space for residual factors", |
119 | GSL_ENOMEM, 0); |
120 | } |
121 | |
122 | w->psi = gsl_vector_alloc(n); |
| 33Call a function.    gsl_vector_alloc(n) |
123 | if (w->psi == 0) |
| 34Take the false branch.    w->psi == 0 |
124 | { |
125 | GSL_ERROR_VAL("failed to allocate space for psi", GSL_ENOMEM, 0); |
126 | } |
127 | |
128 | w->dpsi = gsl_vector_alloc(n); |
| 35Call a function.    gsl_vector_alloc(n) |
129 | if (w->dpsi == 0) |
| 36Take the false branch.    w->dpsi == 0 |
130 | { |
131 | GSL_ERROR_VAL("failed to allocate space for dpsi", GSL_ENOMEM, 0); |
132 | } |
133 | |
134 | w->QSI = gsl_matrix_alloc(p, p); |
| 37Call a function.    gsl_matrix_alloc(p, p) |
135 | if (w->QSI == 0) |
| 38Take the false branch.    w->QSI == 0 |
136 | { |
137 | GSL_ERROR_VAL("failed to allocate space for QSI", GSL_ENOMEM, 0); |
138 | } |
139 | |
140 | w->D = gsl_vector_alloc(p); |
| 39Call a function.    gsl_vector_alloc(p) |
141 | if (w->D == 0) |
| 40Take the false branch.    w->D == 0 |
142 | { |
143 | GSL_ERROR_VAL("failed to allocate space for D", GSL_ENOMEM, 0); |
144 | } |
145 | |
146 | w->workn = gsl_vector_alloc(n); |
| 41Call a function.    gsl_vector_alloc(n) |
147 | if (w->workn == 0) |
| 42Take the true branch.    w->workn == 0 |
148 | { |
149 | GSL_ERROR_VAL("failed to allocate space for workn", GSL_ENOMEM, 0); |
| 43Call a function.    gsl_error("failed to allocate space for workn", "/home/xingming/workplace/experiment/fp/gsl-2.1/multifit/multirobust.c", 149, GSL_ENOMEM) |
150 | } |
151 | |
152 | w->stats.sigma_ols = 0.0; |
153 | w->stats.sigma_mad = 0.0; |
154 | w->stats.sigma_rob = 0.0; |
155 | w->stats.sigma = 0.0; |
156 | w->stats.Rsq = 0.0; |
157 | w->stats.adj_Rsq = 0.0; |
158 | w->stats.rmse = 0.0; |
159 | w->stats.sse = 0.0; |
160 | w->stats.dof = n - p; |
161 | w->stats.weights = w->weights; |
162 | w->stats.r = w->r; |
163 | |
164 | return w; |
165 | } /* gsl_multifit_robust_alloc() */ |
166 | |
167 | /* |
168 | gsl_multifit_robust_free() |
169 | Free memory associated with robust workspace |
170 | */ |
171 | |
172 | void |
173 | gsl_multifit_robust_free(gsl_multifit_robust_workspace *w) |
174 | { |
175 | if (w->multifit_p) |
176 | gsl_multifit_linear_free(w->multifit_p); |
177 | |
178 | if (w->r) |
179 | gsl_vector_free(w->r); |
180 | |
181 | if (w->weights) |
182 | gsl_vector_free(w->weights); |
183 | |
184 | if (w->c_prev) |
185 | gsl_vector_free(w->c_prev); |
186 | |
187 | if (w->resfac) |
188 | gsl_vector_free(w->resfac); |
189 | |
190 | if (w->psi) |
191 | gsl_vector_free(w->psi); |
192 | |
193 | if (w->dpsi) |
194 | gsl_vector_free(w->dpsi); |
195 | |
196 | if (w->QSI) |
197 | gsl_matrix_free(w->QSI); |
198 | |
199 | if (w->D) |
200 | gsl_vector_free(w->D); |
201 | |
202 | if (w->workn) |
203 | gsl_vector_free(w->workn); |
204 | |
205 | free(w); |
206 | } /* gsl_multifit_robust_free() */ |
207 | |
208 | int |
209 | gsl_multifit_robust_tune(const double tune, gsl_multifit_robust_workspace *w) |
210 | { |
211 | w->tune = tune; |
212 | return GSL_SUCCESS; |
213 | } |
214 | |
215 | int |
216 | gsl_multifit_robust_maxiter(const size_t maxiter, |
217 | gsl_multifit_robust_workspace *w) |
218 | { |
219 | if (w->maxiter == 0) |
220 | { |
221 | GSL_ERROR("maxiter must be greater than 0", GSL_EINVAL); |
222 | } |
223 | else |
224 | { |
225 | w->maxiter = maxiter; |
226 | return GSL_SUCCESS; |
227 | } |
228 | } |
229 | |
230 | const char * |
231 | gsl_multifit_robust_name(const gsl_multifit_robust_workspace *w) |
232 | { |
233 | return w->type->name; |
234 | } |
235 | |
236 | gsl_multifit_robust_stats |
237 | gsl_multifit_robust_statistics(const gsl_multifit_robust_workspace *w) |
238 | { |
239 | return w->stats; |
240 | } |
241 | |
242 | /* |
243 | gsl_multifit_robust_weights() |
244 | Compute iterative weights for given residuals |
245 | |
246 | Inputs: r - residuals |
247 | wts - (output) where to store weights |
248 | w_i = r_i / (sigma_mad * tune) |
249 | w - workspace |
250 | |
251 | Return: success/error |
252 | |
253 | Notes: |
254 | 1) Sizes of r and wts must be equal |
255 | 2) Size of r/wts may be less than or equal to w->n, to allow |
256 | for computing weights of a subset of data |
257 | */ |
258 | |
259 | int |
260 | gsl_multifit_robust_weights(const gsl_vector *r, gsl_vector *wts, |
261 | gsl_multifit_robust_workspace *w) |
262 | { |
263 | if (r->size != wts->size) |
264 | { |
265 | GSL_ERROR("residual vector does not match weight vector size", GSL_EBADLEN); |
266 | } |
267 | else |
268 | { |
269 | int s; |
270 | double sigma; |
271 | |
272 | sigma = robust_madsigma(r, w->p, wts); |
273 | |
274 | /* scale residuals by sigma and tuning factor */ |
275 | gsl_vector_memcpy(wts, r); |
276 | gsl_vector_scale(wts, 1.0 / (sigma * w->tune)); |
277 | |
278 | /* compute weights in-place */ |
279 | s = w->type->wfun(wts, wts); |
280 | |
281 | return s; |
282 | } |
283 | } /* gsl_multifit_robust_weights() */ |
284 | |
285 | /* |
286 | gsl_multifit_robust() |
287 | Perform robust iteratively reweighted linear least squares |
288 | fit |
289 | |
290 | Inputs: X - design matrix of basis functions |
291 | y - right hand side vector |
292 | c - (output) model coefficients |
293 | cov - (output) covariance matrix |
294 | w - workspace |
295 | */ |
296 | |
297 | int |
298 | gsl_multifit_robust(const gsl_matrix * X, |
299 | const gsl_vector * y, |
300 | gsl_vector * c, |
301 | gsl_matrix * cov, |
302 | gsl_multifit_robust_workspace *w) |
303 | { |
304 | /* check matrix and vector sizes */ |
305 | if (X->size1 != y->size) |
306 | { |
307 | GSL_ERROR |
308 | ("number of observations in y does not match rows of matrix X", |
309 | GSL_EBADLEN); |
310 | } |
311 | else if (X->size2 != c->size) |
312 | { |
313 | GSL_ERROR ("number of parameters c does not match columns of matrix X", |
314 | GSL_EBADLEN); |
315 | } |
316 | else if (cov->size1 != cov->size2) |
317 | { |
318 | GSL_ERROR ("covariance matrix is not square", GSL_ENOTSQR); |
319 | } |
320 | else if (c->size != cov->size1) |
321 | { |
322 | GSL_ERROR |
323 | ("number of parameters does not match size of covariance matrix", |
324 | GSL_EBADLEN); |
325 | } |
326 | else if (X->size1 != w->n || X->size2 != w->p) |
327 | { |
328 | GSL_ERROR |
329 | ("size of workspace does not match size of observation matrix", |
330 | GSL_EBADLEN); |
331 | } |
332 | else |
333 | { |
334 | int s; |
335 | double chisq; |
336 | const double tol = GSL_SQRT_DBL_EPSILON; |
337 | int converged = 0; |
338 | size_t numit = 0; |
339 | const size_t n = y->size; |
340 | double sigy = gsl_stats_sd(y->data, y->stride, n); |
341 | double sig_lower; |
342 | size_t i; |
343 | |
344 | /* |
345 | * if the initial fit is very good, then finding outliers by comparing |
346 | * them to the residual standard deviation is difficult. Therefore we |
347 | * set a lower bound on the standard deviation estimate that is a small |
348 | * fraction of the standard deviation of the data values |
349 | */ |
350 | sig_lower = 1.0e-6 * sigy; |
351 | if (sig_lower == 0.0) |
352 | sig_lower = 1.0; |
353 | |
354 | /* compute initial estimates using ordinary least squares */ |
355 | s = gsl_multifit_linear(X, y, c, cov, &chisq, w->multifit_p); |
356 | if (s) |
357 | return s; |
358 | |
359 | /* save Q S^{-1} of original matrix */ |
360 | gsl_matrix_memcpy(w->QSI, w->multifit_p->QSI); |
361 | gsl_vector_memcpy(w->D, w->multifit_p->D); |
362 | |
363 | /* compute statistical leverage of each data point */ |
364 | s = gsl_linalg_SV_leverage(w->multifit_p->A, w->resfac); |
365 | if (s) |
366 | return s; |
367 | |
368 | /* correct residuals with factor 1 / sqrt(1 - h) */ |
369 | for (i = 0; i < n; ++i) |
370 | { |
371 | double h = gsl_vector_get(w->resfac, i); |
372 | |
373 | if (h > 0.9999) |
374 | h = 0.9999; |
375 | |
376 | gsl_vector_set(w->resfac, i, 1.0 / sqrt(1.0 - h)); |
377 | } |
378 | |
379 | /* compute residuals from OLS fit r = y - X c */ |
380 | s = gsl_multifit_linear_residuals(X, y, c, w->r); |
381 | if (s) |
382 | return s; |
383 | |
384 | /* compute estimate of sigma from ordinary least squares */ |
385 | w->stats.sigma_ols = gsl_blas_dnrm2(w->r) / sqrt((double) w->stats.dof); |
386 | |
387 | while (!converged && ++numit <= w->maxiter) |
388 | { |
389 | double sig; |
390 | |
391 | /* adjust residuals by statistical leverage (see DuMouchel and O'Brien) */ |
392 | s = gsl_vector_mul(w->r, w->resfac); |
393 | if (s) |
394 | return s; |
395 | |
396 | /* compute estimate of standard deviation using MAD */ |
397 | sig = robust_madsigma(w->r, w->p, w->workn); |
398 | |
399 | /* scale residuals by standard deviation and tuning parameter */ |
400 | gsl_vector_scale(w->r, 1.0 / (GSL_MAX(sig, sig_lower) * w->tune)); |
401 | |
402 | /* compute weights using these residuals */ |
403 | s = w->type->wfun(w->r, w->weights); |
404 | if (s) |
405 | return s; |
406 | |
407 | gsl_vector_memcpy(w->c_prev, c); |
408 | |
409 | /* solve weighted least squares with new weights */ |
410 | s = gsl_multifit_wlinear(X, w->weights, y, c, cov, &chisq, w->multifit_p); |
411 | if (s) |
412 | return s; |
413 | |
414 | /* compute new residuals r = y - X c */ |
415 | s = gsl_multifit_linear_residuals(X, y, c, w->r); |
416 | if (s) |
417 | return s; |
418 | |
419 | converged = robust_test_convergence(w->c_prev, c, tol); |
420 | } |
421 | |
422 | /* compute final MAD sigma */ |
423 | w->stats.sigma_mad = robust_madsigma(w->r, w->p, w->workn); |
424 | |
425 | /* compute robust estimate of sigma */ |
426 | w->stats.sigma_rob = robust_robsigma(w->r, w->stats.sigma_mad, w->tune, w); |
427 | |
428 | /* compute final estimate of sigma */ |
429 | w->stats.sigma = robust_sigma(w->stats.sigma_ols, w->stats.sigma_rob, w); |
430 | |
431 | /* store number of iterations */ |
432 | w->stats.numit = numit; |
433 | |
434 | { |
435 | double dof = (double) w->stats.dof; |
436 | double rnorm = w->stats.sigma * sqrt(dof); /* see DuMouchel, sec 4.2 */ |
437 | double ss_err = rnorm * rnorm; |
438 | double ss_tot = gsl_stats_tss(y->data, y->stride, n); |
439 | |
440 | /* compute R^2 */ |
441 | w->stats.Rsq = 1.0 - ss_err / ss_tot; |
442 | |
443 | /* compute adjusted R^2 */ |
444 | w->stats.adj_Rsq = 1.0 - (1.0 - w->stats.Rsq) * ((double)n - 1.0) / dof; |
445 | |
446 | /* compute rmse */ |
447 | w->stats.rmse = sqrt(ss_err / dof); |
448 | |
449 | /* store SSE */ |
450 | w->stats.sse = ss_err; |
451 | } |
452 | |
453 | /* calculate covariance matrix = sigma^2 (X^T X)^{-1} */ |
454 | s = robust_covariance(w->stats.sigma, cov, w); |
455 | if (s) |
456 | return s; |
457 | |
458 | /* raise an error if not converged */ |
459 | if (numit > w->maxiter) |
460 | { |
461 | GSL_ERROR("maximum iterations exceeded", GSL_EMAXITER); |
462 | } |
463 | |
464 | return s; |
465 | } |
466 | } /* gsl_multifit_robust() */ |
467 | |
468 | /* Estimation of values for given x */ |
469 | int |
470 | gsl_multifit_robust_est(const gsl_vector * x, const gsl_vector * c, |
471 | const gsl_matrix * cov, double *y, double *y_err) |
472 | { |
473 | int s = gsl_multifit_linear_est(x, c, cov, y, y_err); |
474 | |
475 | return s; |
476 | } |
477 | |
478 | /* |
479 | gsl_multifit_robust_residuals() |
480 | Compute robust / studentized residuals from fit |
481 | |
482 | r_i = (y_i - Y_i) / (sigma * sqrt(1 - h_i)) |
483 | |
484 | Inputs: X - design matrix |
485 | y - rhs vector |
486 | c - fit coefficients |
487 | r - (output) studentized residuals |
488 | w - workspace |
489 | |
490 | Notes: |
491 | 1) gsl_multifit_robust() must first be called to compute the coefficients |
492 | c, the leverage factors in w->resfac, and sigma in w->stats.sigma |
493 | */ |
494 | |
495 | int |
496 | gsl_multifit_robust_residuals(const gsl_matrix * X, const gsl_vector * y, |
497 | const gsl_vector * c, gsl_vector * r, |
498 | gsl_multifit_robust_workspace * w) |
499 | { |
500 | if (X->size1 != y->size) |
501 | { |
502 | GSL_ERROR |
503 | ("number of observations in y does not match rows of matrix X", |
504 | GSL_EBADLEN); |
505 | } |
506 | else if (X->size2 != c->size) |
507 | { |
508 | GSL_ERROR ("number of parameters c does not match columns of matrix X", |
509 | GSL_EBADLEN); |
510 | } |
511 | else if (y->size != r->size) |
512 | { |
513 | GSL_ERROR ("number of observations in y does not match number of residuals", |
514 | GSL_EBADLEN); |
515 | } |
516 | else |
517 | { |
518 | const double sigma = w->stats.sigma; /* previously calculated sigma */ |
519 | int s; |
520 | size_t i; |
521 | |
522 | /* compute r = y - X c */ |
523 | s = gsl_multifit_linear_residuals(X, y, c, r); |
524 | if (s) |
525 | return s; |
526 | |
527 | for (i = 0; i < r->size; ++i) |
528 | { |
529 | double hfac = gsl_vector_get(w->resfac, i); /* 1/sqrt(1 - h_i) */ |
530 | double *ri = gsl_vector_ptr(r, i); |
531 | |
532 | /* multiply residual by 1 / (sigma * sqrt(1 - h_i)) */ |
533 | *ri *= hfac / sigma; |
534 | } |
535 | |
536 | return s; |
537 | } |
538 | } /* gsl_multifit_robust_residuals() */ |
539 | |
540 | /*********************************** |
541 | * INTERNAL ROUTINES * |
542 | ***********************************/ |
543 | |
544 | /* |
545 | robust_test_convergence() |
546 | Test for convergence in robust least squares |
547 | |
548 | Convergence criteria: |
549 | |
550 | |c_i^(k) - c_i^(k-1)| <= tol * max(|c_i^(k)|, |c_i^(k-1)|) |
551 | |
552 | for all i. k refers to iteration number. |
553 | |
554 | Inputs: c_prev - coefficients from previous iteration |
555 | c - coefficients from current iteration |
556 | tol - tolerance |
557 | |
558 | Return: 1 if converged, 0 if not |
559 | */ |
560 | |
561 | static int |
562 | robust_test_convergence(const gsl_vector *c_prev, const gsl_vector *c, |
563 | const double tol) |
564 | { |
565 | size_t p = c->size; |
566 | size_t i; |
567 | |
568 | for (i = 0; i < p; ++i) |
569 | { |
570 | double ai = gsl_vector_get(c_prev, i); |
571 | double bi = gsl_vector_get(c, i); |
572 | |
573 | if (fabs(bi - ai) > tol * GSL_MAX(fabs(ai), fabs(bi))) |
574 | return 0; /* not yet converged */ |
575 | } |
576 | |
577 | /* converged */ |
578 | return 1; |
579 | } /* robust_test_convergence() */ |
580 | |
581 | /* |
582 | robust_madsigma() |
583 | Estimate the standard deviation of the residuals using |
584 | the Median-Absolute-Deviation (MAD) of the residuals, |
585 | throwing away the smallest p residuals. |
586 | |
587 | See: Street et al, 1988 |
588 | |
589 | Inputs: r - vector of residuals |
590 | p - number of model coefficients (smallest p residuals are |
591 | ignored) |
592 | workn - workspace of size n = length(r) |
593 | */ |
594 | |
595 | static double |
596 | robust_madsigma(const gsl_vector *r, const size_t p, gsl_vector *workn) |
597 | { |
598 | size_t n = r->size; |
599 | double sigma; |
600 | size_t i; |
601 | |
602 | /* allow for the possibility that r->size < w->n */ |
603 | gsl_vector_view v1 = gsl_vector_subvector(workn, 0, n); |
604 | gsl_vector_view v2; |
605 | |
606 | /* copy |r| into workn */ |
607 | for (i = 0; i < n; ++i) |
608 | { |
609 | gsl_vector_set(&v1.vector, i, fabs(gsl_vector_get(r, i))); |
610 | } |
611 | |
612 | gsl_sort_vector(&v1.vector); |
613 | |
614 | /* |
615 | * ignore the smallest p residuals when computing the median |
616 | * (see Street et al 1988) |
617 | */ |
618 | v2 = gsl_vector_subvector(&v1.vector, p - 1, n - p + 1); |
619 | sigma = gsl_stats_median_from_sorted_data(v2.vector.data, v2.vector.stride, v2.vector.size) / 0.6745; |
620 | |
621 | return sigma; |
622 | } /* robust_madsigma() */ |
623 | |
624 | /* |
625 | robust_robsigma() |
626 | Compute robust estimate of sigma so that |
627 | sigma^2 * inv(X' * X) is a reasonable estimate of |
628 | the covariance for robust regression. Based heavily |
629 | on the equations of Street et al, 1988. |
630 | |
631 | Inputs: r - vector of residuals y - X c |
632 | s - sigma estimate using MAD |
633 | tune - tuning constant |
634 | w - workspace |
635 | */ |
636 | |
637 | static double |
638 | robust_robsigma(const gsl_vector *r, const double s, |
639 | const double tune, gsl_multifit_robust_workspace *w) |
640 | { |
641 | double sigma; |
642 | size_t i; |
643 | const size_t n = w->n; |
644 | const size_t p = w->p; |
645 | const double st = s * tune; |
646 | double a, b, lambda; |
647 | |
648 | /* compute u = r / sqrt(1 - h) / st */ |
649 | gsl_vector_memcpy(w->workn, r); |
650 | gsl_vector_mul(w->workn, w->resfac); |
651 | gsl_vector_scale(w->workn, 1.0 / st); |
652 | |
653 | /* compute w(u) and psi'(u) */ |
654 | w->type->wfun(w->workn, w->psi); |
655 | w->type->psi_deriv(w->workn, w->dpsi); |
656 | |
657 | /* compute psi(u) = u*w(u) */ |
658 | gsl_vector_mul(w->psi, w->workn); |
659 | |
660 | /* Street et al, Eq (3) */ |
661 | a = gsl_stats_mean(w->dpsi->data, w->dpsi->stride, n); |
662 | |
663 | /* Street et al, Eq (5) */ |
664 | b = 0.0; |
665 | for (i = 0; i < n; ++i) |
666 | { |
667 | double psi_i = gsl_vector_get(w->psi, i); |
668 | double resfac = gsl_vector_get(w->resfac, i); |
669 | double fac = 1.0 / (resfac*resfac); /* 1 - h */ |
670 | |
671 | b += fac * psi_i * psi_i; |
672 | } |
673 | b /= (double) (n - p); |
674 | |
675 | /* Street et al, Eq (5) */ |
676 | lambda = 1.0 + ((double)p)/((double)n) * (1.0 - a) / a; |
677 | |
678 | sigma = lambda * sqrt(b) * st / a; |
679 | |
680 | return sigma; |
681 | } /* robust_robsigma() */ |
682 | |
683 | /* |
684 | robust_sigma() |
685 | Compute final estimate of residual standard deviation, using |
686 | the OLS and robust sigma estimates. |
687 | |
688 | This equation is taken from DuMouchel and O'Brien, sec 4.1: |
689 | \hat{\sigma_R} |
690 | |
691 | Inputs: s_ols - OLS sigma |
692 | s_rob - robust sigma |
693 | w - workspace |
694 | |
695 | Return: final estimate of sigma |
696 | */ |
697 | |
698 | static double |
699 | robust_sigma(const double s_ols, const double s_rob, |
700 | gsl_multifit_robust_workspace *w) |
701 | { |
702 | double sigma; |
703 | const double p = (double) w->p; |
704 | const double n = (double) w->n; |
705 | |
706 | /* see DuMouchel and O'Brien, sec 4.1 */ |
707 | sigma = GSL_MAX(s_rob, |
708 | sqrt((s_ols*s_ols*p*p + s_rob*s_rob*n) / |
709 | (p*p + n))); |
710 | |
711 | return sigma; |
712 | } /* robust_sigma() */ |
713 | |
714 | /* |
715 | robust_covariance() |
716 | Calculate final covariance matrix, defined as: |
717 | |
718 | sigma * (X^T X)^{-1} |
719 | |
720 | Inputs: sigma - residual standard deviation |
721 | cov - (output) covariance matrix |
722 | w - workspace |
723 | */ |
724 | |
725 | static int |
726 | robust_covariance(const double sigma, gsl_matrix *cov, |
727 | gsl_multifit_robust_workspace *w) |
728 | { |
729 | int status = 0; |
730 | const size_t p = w->p; |
731 | const double s2 = sigma * sigma; |
732 | size_t i, j; |
733 | gsl_matrix *QSI = w->QSI; |
734 | gsl_vector *D = w->D; |
735 | |
736 | /* Form variance-covariance matrix cov = s2 * (Q S^-1) (Q S^-1)^T */ |
737 | |
738 | for (i = 0; i < p; i++) |
739 | { |
740 | gsl_vector_view row_i = gsl_matrix_row (QSI, i); |
741 | double d_i = gsl_vector_get (D, i); |
742 | |
743 | for (j = i; j < p; j++) |
744 | { |
745 | gsl_vector_view row_j = gsl_matrix_row (QSI, j); |
746 | double d_j = gsl_vector_get (D, j); |
747 | double s; |
748 | |
749 | gsl_blas_ddot (&row_i.vector, &row_j.vector, &s); |
750 | |
751 | gsl_matrix_set (cov, i, j, s * s2 / (d_i * d_j)); |
752 | gsl_matrix_set (cov, j, i, s * s2 / (d_i * d_j)); |
753 | } |
754 | } |
755 | |
756 | return status; |
757 | } /* robust_covariance() */ |
758 | |