diff options
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | arith.c | 51 | ||||
-rw-r--r-- | arith.h | 2 | ||||
-rw-r--r-- | psquare.c | 169 | ||||
-rw-r--r-- | psquare.h | 54 | ||||
-rw-r--r-- | stdlib/doc-syms.tl | 1 | ||||
-rw-r--r-- | tests/016/arith.tl | 31 | ||||
-rw-r--r-- | txr.1 | 143 |
8 files changed, 452 insertions, 1 deletions
@@ -55,7 +55,7 @@ OBJS := txr.o lex.yy.o y.tab.o match.o lib.o regex.o gc.o unwind.o stream.o OBJS += arith.o hash.o utf8.o filter.o eval.o parser.o rand.o combi.o sysif.o OBJS += args.o lisplib.o cadr.o struct.o itypes.o buf.o jmp.o protsym.o ffi.o OBJS += strudel.o vm.o chksum.o chksums/sha256.o chksums/crc32.o chksums/md5.o -OBJS += tree.o time.o +OBJS += tree.o time.o psquare.o OBJS += linenoise/linenoise.o OBJS-$(debug_support) += debug.o OBJS-$(have_syslog) += syslog.o @@ -50,6 +50,7 @@ #include "itypes.h" #include "struct.h" #include "txr.h" +#include "psquare.h" #include "arith.h" #define max(a, b) ((a) > (b) ? (a) : (b)) @@ -4507,6 +4508,55 @@ val lcmv(struct args *nlist) return nary_op(lit("lcm"), lcm, abso_self, nlist, zero); } +static struct cobj_ops psq_ops = cobj_ops_init(cobj_equal_handle_op, + cptr_print_op, + cobj_destroy_free_op, + cobj_mark_op, + cobj_handle_hash_op); + +static val quant_fun(val psqo, struct args *args) +{ + val self = lit("quantile"); + cnum idx = 0; + struct psquare *psq = coerce(struct psquare *, psqo->cp.handle); + + while (args_more(args, idx)) { + val arg = args_get(args, &idx); + if (numberp(arg)) { + double s = c_flo(to_float(self, arg), self); + psq_add_sample(psq, s, self); + } else { + seq_iter_t item_iter; + val sample; + seq_iter_init(self, &item_iter, arg); + + while (seq_get(&item_iter, &sample)) { + double s = c_flo(to_float(self, sample), self); + psq_add_sample(psq, s, self); + } + } + } + + return flo(psq_get_estimate(psq)); +} + +val quantile(val pv, val grsize_in, val rate_in) +{ + val self = lit("quantile"); + double p = c_flo(to_float(self, pv), self); + struct psquare *psq = coerce(struct psquare *, chk_malloc(sizeof *psq)); + val psqo = cptr_typed(coerce(mem_t *, psq), nil, &psq_ops); + if (missingp(grsize_in)) { + psq_init(psq, p); + } else { + ucnum grsize = c_unum(grsize_in, self); + double rate = if3(missingp(rate_in), + 0.999, + c_flo(to_float(self, rate_in), self)); + psq_init_grouped(psq, p, grsize, rate); + } + return func_f0v(psqo, quant_fun); +} void arith_init(void) { @@ -4707,6 +4757,7 @@ void arith_init(void) reg_fun(intern(lit("b/"), system_package), func_n2(divi)); reg_fun(neg_s, func_n1(neg)); + reg_fun(intern(lit("quantile"), user_package), func_n3o(quantile, 1)); #if HAVE_ROUNDING_CTL_H reg_varl(intern(lit("flo-near"), user_package), num(FE_TONEAREST)); reg_varl(intern(lit("flo-down"), user_package), num(FE_DOWNWARD)); @@ -61,6 +61,8 @@ val digits(val n, val base); val poly(val x, val seq); val rpoly(val x, val seq); +val quantile(val pv, val chsize_in, val rate_in); + NORETURN void do_mp_error(val self, mp_err code); void arith_init(void); void arith_compat_fixup(int compat_ver); diff --git a/psquare.c b/psquare.c new file mode 100644 index 00000000..0a1dfb05 --- /dev/null +++ b/psquare.c @@ -0,0 +1,169 @@ +#include <stddef.h> +#include <wchar.h> +#include <limits.h> +#include <string.h> +#include <stdlib.h> +#include <stdarg.h> +#include <signal.h> +#include <stdio.h> +#include <math.h> +#include "config.h" +#include "lib.h" +#include "signal.h" +#include "unwind.h" +#include "psquare.h" + +static void psq_reset(struct psquare *psq) +{ + double p = psq->p; + + psq->n[0] = 1; + psq->n[1] = 2; + psq->n[2] = 3; + psq->n[3] = 4; + psq->n[4] = 5; + + psq->wn[0] = 1.0; + psq->wn[1] = 1.0 + 2.0 * p; + psq->wn[2] = 1.0 + 4.0 * p; + psq->wn[3] = 3.0 + 2.0 * p; + psq->wn[4] = 5.0; + + psq->dn[1] = p / 2.0; + psq->dn[2] = p; + psq->dn[3] = (1.0 + p) / 2.0; + psq->dn[4] = 1.0; +} + +void psq_init(struct psquare *psq, double p) +{ + memset(psq, 0, sizeof *psq); + psq->p = p; + psq_reset(psq); +} + +void psq_init_grouped(struct psquare *psq, double p, + ucnum grsize, double rate) +{ + psq_init(psq, p); + psq->type = psq_grouped; + psq->grsize = grsize; + psq->rate = rate; + psq->blend = 1.0; +} + +static int dbl_cmp(const void *lp, const void *rp) +{ + double ln = *coerce(const double *, lp); + double rn = *coerce(const double *, rp); + + if (ln < rn) + return -1; + if (ln > rn) + return 1; + return 0; +} + +void psq_add_sample(struct psquare *psq, double s, val self) +{ + ucnum c = psq->count++; + + if (psq->type == psq_grouped && c >= psq->grsize) { + psq->prev = psq_get_estimate(psq); + psq->blending = 1; + psq->count = 1; + psq_reset(psq); + c = 0; + } + + if (c < 5) { + psq->q[c] = s; + } else { + int k = 0, i; + + if (c == 5) + qsort(psq->q, c, sizeof psq->q[0], dbl_cmp); + + if (s < psq->q[0]) { + psq->q[0] = s; + } else if (psq->q[4] <= s) { + psq->q[4] = s; + k = 3; + } else for (k = 0; k < 4; k++) { + if (psq->q[k] <= s && s < psq->q[k + 1]) + break; + } + + if (psq->n[4] == INT_PTR_MAX) + uw_throwf(numeric_error_s, lit("~a: sample capacity overflow"), + self, nao); + + for (i = k + 1; i < 5; i++) + psq->n[i]++; + + for (i = 0; i < 5; i++) + psq->wn[i] += psq->dn[i]; + + for (i = 1; i <= 3; i++) { + double d = psq->wn[i] - psq->n[i]; + double ds = psq->n[i + 1] - psq->n[i]; + double dp = psq->n[i - 1] - psq->n[i]; + + if ((d >= 1 && ds > 1) || (d <= -1 && dp < -1)) { + int sgd = d < 0 ? -1 : 1; + double qs = (psq->q[i + 1] - psq->q[i]) / ds; + double qp = (psq->q[i - 1] - psq->q[i]) / dp; + double q = psq->q[i] + sgd/(ds - dp)*((sgd - dp)*qs + (ds - sgd)*qp); + + if (psq->q[i - 1] < q && q < psq->q[i + 1]) { + psq->q[i] = q; + } else { + if (d > 0) + psq->q[i] += qs; + else if (d < 0) + psq->q[i] -= qp; + } + + psq->n[i] += sgd; + } + } + } + + if (psq->blending) + psq->blend *= psq->rate; +} + +double psq_get_estimate(struct psquare *psq) +{ + ucnum c = psq->count; + double est; + + if (c > 2 && c < 5) + qsort(psq->q, c, sizeof psq->q[0], dbl_cmp); + + switch (psq->count) { + case 0: + est = 0.0; + break; + case 1: + est = psq->q[0]; + break; + case 2: + est = psq->q[0] + (psq->q[1] - psq->q[0]) / 2; + break; + case 3: + est = psq->q[1]; + break; + case 4: + est = psq->q[1] + (psq->q[2] - psq->q[1]) / 2; + break; + default: + est = psq->q[2]; + break; + } + + if (!psq->blending) + return est; + + return psq->blend * psq->prev + (1 - psq->blend) * est; +} diff --git a/psquare.h b/psquare.h new file mode 100644 index 00000000..77580e02 --- /dev/null +++ b/psquare.h @@ -0,0 +1,54 @@ +/* Copyright 2021 + * Kaz Kylheku <kaz@kylheku.com> + * Vancouver, Canada + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +enum psq_type { + psq_regular, + psq_grouped +}; + +struct psquare { + enum psq_type type; + double p; + double q[5]; + double wn[5]; + double dn[5]; + cnum n[5]; + ucnum count; + /* psq_grouped fields */ + ucnum grsize; + double prev; + double rate; + double blend; + int blending; +}; + +void psq_init(struct psquare *, double p); +void psq_init_grouped(struct psquare *, double p, + ucnum grsize, double rate); +void psq_add_sample(struct psquare *, double s, val self); +double psq_get_estimate(struct psquare *); diff --git a/stdlib/doc-syms.tl b/stdlib/doc-syms.tl index 94b2cf89..60a81501 100644 --- a/stdlib/doc-syms.tl +++ b/stdlib/doc-syms.tl @@ -1503,6 +1503,7 @@ ("pwd" "N-0047F5F6") ("qquote" "N-01665185") ("qref" "D-006E") + ("quantile" "N-0318C018") ("quip" "N-03C6D422") ("quote" "N-0163F998") ("r$" "N-03BBB0C5") diff --git a/tests/016/arith.tl b/tests/016/arith.tl index be0d512b..34a82c7f 100644 --- a/tests/016/arith.tl +++ b/tests/016/arith.tl @@ -222,3 +222,34 @@ (< '(1 2 3) #(1 2 3.0)) nil (< '(1 2 3) #(1 2 3 4)) t (< '(1 2 3) #(1 2 4)) t) + +(test + (let ((q (quantile 0.5))) + [q 0.02 0.5 0.74 3.39 0.83] + [mapcar q '(22.37 10.15 15.43 38.62 15.92 + 34.60 10.28 1.47 0.40 0.05 11.39 + 0.27 0.42 0.09 11.37)]) + (0.73999999999999999 0.73999999999999999 2.0616666666666665 + 4.5517592592592591 4.5517592592592591 9.1519618055555547 + 9.1519618055555547 9.1519618055555547 9.1519618055555547 + 6.1797614914021164 6.1797614914021164 6.1797614914021164 + 6.1797614914021164 4.2462394088036444 4.2462394088036444)) + +(test + (let ((q (quantile 0))) + (cons [q] [mapcar q '(1 2 3 4 5)])) + (0.0 1.0 1.5 2.0 2.5 3.0)) + +(test + (let ((q (quantile 0 5 0.5))) + [mapcar q '(1.0 2.0 3.0 4.0 5.0 + 0.0 0.0 0.0 0.0 0.0)]) + (1.0 1.5 2.0 2.5 3.0 + 1.5 0.75 0.375 0.1875 0.09375)) + +(test + (let ((q (quantile 0 5 0.5))) + [mapcar q '(0.0 0.0 0.0 0.0 0.0 + 3.0 3.0 3.0 3.0 3.0)]) + (0.0 0.0 0.0 0.0 0.0 + 1.5 2.25 2.625 2.8125 2.90625)) @@ -46577,6 +46577,149 @@ benefits from ordering the operations on multiple integer operands according to the magnitudes of those operands. The function provides an estimate of magnitude which trades accuracy for efficiency. +.coNP Function @ quantile +.synb +.mets (quantile < p >> [ group-size <> [ rate ]]) +.syne +.desc +The +.code quantile +function returns a function which estimates a specific quantile +of a set of real-valued samples. The desired quantile is indicated +by the +.meta p +parameter, which is a number in the range 0 to 1.0. If +.meta p +is specified as 0.5, then the median is estimated. +The +.meta p +value of 0.9 leads to the estimation of the 90th percentile: +a value such that approximately 90% of the samples are below that value. + +If the +.meta group-size +parameter is specified, it must be a positive integer. +The returned function then operates in grouped mode. The +.meta rate +parameter is relevant only to grouped mode. Grouped mode is +described below. + +The function returned by +.code quantile +maintains internal state in relation to calculating the quantile. +The function may be called with any number of arguments, including +none. It expects every argument to be either a number, or a sequence +of numbers. These numbers are accumulated into the quantile calculation, +and a revised estimate of the quantile is then returned. + +Note: the algorithm used is the P-Squared algorithm invented in 1985 by Raj +Jain and Imrich Chlamtac, which avoids accumulating and sorting the entire data +set, while still obtaining good quality estimates of the quantile. +The algorithm requires an initial seed of five samples. Then additional +samples input into the algorithm produce quantile estimates. To eliminate this +special case from the abstract interface, the \*(TX implementation is capable of +producing an estimate when five or fewer samples have been presented, including +none. In this low situation, the +.meta p +value is ignored in reporting the estimate. When no samples have been given, +the estimate is zero. When one sample has been given, the estimate is that +sample itself. When between two and five samples have been given, the estimate +is their median. Using the median as the estimate ensures a smooth transition +from these early estimates into the estimates produced by the P-Squared +algorithm. This is because the P-Squared algorithm always reports the value of +the middle height accumulator as the estimate, and that accumulator's initial +value is the median of the first five samples. + +The function returned by +.codn quantile , +though not accumulating all of the samples passed to it, nevertheless has +a limited sample capacity, because the registers it uses for tracking the +sample group positions are fixed-width integers. The sample capacity is +approximately 4 times the value of +.codn fixnum-max . + +.TP* Example: + +.verb + (defparm q (quantile 0.9)) ;; create 90-th percentile accumulator + + [q] -> 0.0 ;; no samples given: estimate is 0. + [q 3.14] -> 3.14 ;; one sample: estimate is that sample + [q 13.3 7.9 5.2 6.3] -> 7.9 ;; five samples: estimate is median. + [q 6.8 7.3 9.1 4.0] ;; more than five samples; estimate now + -> 8.44651234567901 ;; from P-Square algorithm + [q #(13.1 5 2.5)] ;; vector argument + -> 9.68660493827161 + [q] -> 9.68660493827161 ;; no arguments: repeat current estimate +.brev + +If the +.meta group-size +argument is specified, then the quantile accumulator operates in grouped mode. +Grouped mode allows infinite sample operation without overflow: an unlimited +number of samples can be accepted. However, old samples lose their influence +over the estimated value: newer samples are considered more significant than +old samples. + +In grouped mode, the quantile accumulator is reset to its initial state whenever +.meta group-size +samples have been accumulated, and begins freshly calculating the quantile. +Prior to the reset, an estimate is obtained and retained in an internal +register. Going forward, this remembered previous estimate is blended in with +the newly calculated estimate values, as described below. The cycle repeats +itself whenever +.meta group-size +samples accumulate: the state is reset, and the current estimate is loaded into +the previous estimate register, which is then blended with newly computed +values. + +The +.meta rate +parameter, whose default value is 0.999, controls the estimate blending. +It should be a value between 0 and 1. + +Upon each reset, a blend value register is initialized to 1.0. Each time +a new sample is accumulated, the blend register is multiplied by the rate +parameter, and the product is stored back into the blend register. +Thus if the rate is between 0 and 1, exclusive, then the blend register +exponentially decreases as the number of samples grows. The blend register +indicates the fraction of the estimate which comes from the remembered previous +estimate. + +For instance, if the current blend value is 0.8, then the returned estimate +value is 0.8 times the remembered previous estimate, plus 0.2 times the newly +computed estimate for the current sample in the new group: the previous and +current estimate are blended 80:20. + +The default +.meta rate +value of 0.999 is chosen for a slow transition to the new estimates, which +helps to conceal inaccuracies in the algorithm associated with having +accumulated a small number of samples. At this rate, it requires about 290 +samples before the blend value drops to 75% of the old estimate. + +If +.code rate +is specified as 0, then no blending of the previous estimate value +takes place, since the blend factor will drop to zero upon the first +sample being received after the group reset, causing the newly calculated +estimates to be returned without blending. The previous sample groups +therefore have no influence over newer estimates. If +.code rate +is specified as 1, then the blend factor will stay at 1, and so the +estimate will forever remain at the previous value, ignoring the +calculations driven by the new samples. + +Note: it is recommended that if +.meta group-size +is specified, the value should be at least several hundred. Too small +a group size will prevent the estimation algorithm from settling on +good results. The +.meta rate +parameter should not much smaller than 1. A rate too low will cause +the previous estimate's contribution to the quantile value to diminish, +too quickly, before the new estimation settles. + .coNP Variables @, flo-near @, flo-down @ flo-up and @ flo-zero .desc These variables hold integer values suitable as arguments to the |