summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Makefile2
-rw-r--r--arith.c51
-rw-r--r--arith.h2
-rw-r--r--psquare.c169
-rw-r--r--psquare.h54
-rw-r--r--stdlib/doc-syms.tl1
-rw-r--r--tests/016/arith.tl31
-rw-r--r--txr.1143
8 files changed, 452 insertions, 1 deletions
diff --git a/Makefile b/Makefile
index bf79d15b..d8bc6797 100644
--- a/Makefile
+++ b/Makefile
@@ -55,7 +55,7 @@ OBJS := txr.o lex.yy.o y.tab.o match.o lib.o regex.o gc.o unwind.o stream.o
OBJS += arith.o hash.o utf8.o filter.o eval.o parser.o rand.o combi.o sysif.o
OBJS += args.o lisplib.o cadr.o struct.o itypes.o buf.o jmp.o protsym.o ffi.o
OBJS += strudel.o vm.o chksum.o chksums/sha256.o chksums/crc32.o chksums/md5.o
-OBJS += tree.o time.o
+OBJS += tree.o time.o psquare.o
OBJS += linenoise/linenoise.o
OBJS-$(debug_support) += debug.o
OBJS-$(have_syslog) += syslog.o
diff --git a/arith.c b/arith.c
index d649562f..d3f2b186 100644
--- a/arith.c
+++ b/arith.c
@@ -50,6 +50,7 @@
#include "itypes.h"
#include "struct.h"
#include "txr.h"
+#include "psquare.h"
#include "arith.h"
#define max(a, b) ((a) > (b) ? (a) : (b))
@@ -4507,6 +4508,55 @@ val lcmv(struct args *nlist)
return nary_op(lit("lcm"), lcm, abso_self, nlist, zero);
}
+static struct cobj_ops psq_ops = cobj_ops_init(cobj_equal_handle_op,
+ cptr_print_op,
+ cobj_destroy_free_op,
+ cobj_mark_op,
+ cobj_handle_hash_op);
+
+static val quant_fun(val psqo, struct args *args)
+{
+ val self = lit("quantile");
+ cnum idx = 0;
+ struct psquare *psq = coerce(struct psquare *, psqo->cp.handle);
+
+ while (args_more(args, idx)) {
+ val arg = args_get(args, &idx);
+ if (numberp(arg)) {
+ double s = c_flo(to_float(self, arg), self);
+ psq_add_sample(psq, s, self);
+ } else {
+ seq_iter_t item_iter;
+ val sample;
+ seq_iter_init(self, &item_iter, arg);
+
+ while (seq_get(&item_iter, &sample)) {
+ double s = c_flo(to_float(self, sample), self);
+ psq_add_sample(psq, s, self);
+ }
+ }
+ }
+
+ return flo(psq_get_estimate(psq));
+}
+
+val quantile(val pv, val grsize_in, val rate_in)
+{
+ val self = lit("quantile");
+ double p = c_flo(to_float(self, pv), self);
+ struct psquare *psq = coerce(struct psquare *, chk_malloc(sizeof *psq));
+ val psqo = cptr_typed(coerce(mem_t *, psq), nil, &psq_ops);
+ if (missingp(grsize_in)) {
+ psq_init(psq, p);
+ } else {
+ ucnum grsize = c_unum(grsize_in, self);
+ double rate = if3(missingp(rate_in),
+ 0.999,
+ c_flo(to_float(self, rate_in), self));
+ psq_init_grouped(psq, p, grsize, rate);
+ }
+ return func_f0v(psqo, quant_fun);
+}
void arith_init(void)
{
@@ -4707,6 +4757,7 @@ void arith_init(void)
reg_fun(intern(lit("b/"), system_package), func_n2(divi));
reg_fun(neg_s, func_n1(neg));
+ reg_fun(intern(lit("quantile"), user_package), func_n3o(quantile, 1));
#if HAVE_ROUNDING_CTL_H
reg_varl(intern(lit("flo-near"), user_package), num(FE_TONEAREST));
reg_varl(intern(lit("flo-down"), user_package), num(FE_DOWNWARD));
diff --git a/arith.h b/arith.h
index 1651853a..21288519 100644
--- a/arith.h
+++ b/arith.h
@@ -61,6 +61,8 @@ val digits(val n, val base);
val poly(val x, val seq);
val rpoly(val x, val seq);
+val quantile(val pv, val chsize_in, val rate_in);
+
NORETURN void do_mp_error(val self, mp_err code);
void arith_init(void);
void arith_compat_fixup(int compat_ver);
diff --git a/psquare.c b/psquare.c
new file mode 100644
index 00000000..0a1dfb05
--- /dev/null
+++ b/psquare.c
@@ -0,0 +1,169 @@
+#include <stddef.h>
+#include <wchar.h>
+#include <limits.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <signal.h>
+#include <stdio.h>
+#include <math.h>
+#include "config.h"
+#include "lib.h"
+#include "signal.h"
+#include "unwind.h"
+#include "psquare.h"
+
+static void psq_reset(struct psquare *psq)
+{
+ double p = psq->p;
+
+ psq->n[0] = 1;
+ psq->n[1] = 2;
+ psq->n[2] = 3;
+ psq->n[3] = 4;
+ psq->n[4] = 5;
+
+ psq->wn[0] = 1.0;
+ psq->wn[1] = 1.0 + 2.0 * p;
+ psq->wn[2] = 1.0 + 4.0 * p;
+ psq->wn[3] = 3.0 + 2.0 * p;
+ psq->wn[4] = 5.0;
+
+ psq->dn[1] = p / 2.0;
+ psq->dn[2] = p;
+ psq->dn[3] = (1.0 + p) / 2.0;
+ psq->dn[4] = 1.0;
+}
+
+void psq_init(struct psquare *psq, double p)
+{
+ memset(psq, 0, sizeof *psq);
+ psq->p = p;
+ psq_reset(psq);
+}
+
+void psq_init_grouped(struct psquare *psq, double p,
+ ucnum grsize, double rate)
+{
+ psq_init(psq, p);
+ psq->type = psq_grouped;
+ psq->grsize = grsize;
+ psq->rate = rate;
+ psq->blend = 1.0;
+}
+
+static int dbl_cmp(const void *lp, const void *rp)
+{
+ double ln = *coerce(const double *, lp);
+ double rn = *coerce(const double *, rp);
+
+ if (ln < rn)
+ return -1;
+ if (ln > rn)
+ return 1;
+ return 0;
+}
+
+void psq_add_sample(struct psquare *psq, double s, val self)
+{
+ ucnum c = psq->count++;
+
+ if (psq->type == psq_grouped && c >= psq->grsize) {
+ psq->prev = psq_get_estimate(psq);
+ psq->blending = 1;
+ psq->count = 1;
+ psq_reset(psq);
+ c = 0;
+ }
+
+ if (c < 5) {
+ psq->q[c] = s;
+ } else {
+ int k = 0, i;
+
+ if (c == 5)
+ qsort(psq->q, c, sizeof psq->q[0], dbl_cmp);
+
+ if (s < psq->q[0]) {
+ psq->q[0] = s;
+ } else if (psq->q[4] <= s) {
+ psq->q[4] = s;
+ k = 3;
+ } else for (k = 0; k < 4; k++) {
+ if (psq->q[k] <= s && s < psq->q[k + 1])
+ break;
+ }
+
+ if (psq->n[4] == INT_PTR_MAX)
+ uw_throwf(numeric_error_s, lit("~a: sample capacity overflow"),
+ self, nao);
+
+ for (i = k + 1; i < 5; i++)
+ psq->n[i]++;
+
+ for (i = 0; i < 5; i++)
+ psq->wn[i] += psq->dn[i];
+
+ for (i = 1; i <= 3; i++) {
+ double d = psq->wn[i] - psq->n[i];
+ double ds = psq->n[i + 1] - psq->n[i];
+ double dp = psq->n[i - 1] - psq->n[i];
+
+ if ((d >= 1 && ds > 1) || (d <= -1 && dp < -1)) {
+ int sgd = d < 0 ? -1 : 1;
+ double qs = (psq->q[i + 1] - psq->q[i]) / ds;
+ double qp = (psq->q[i - 1] - psq->q[i]) / dp;
+ double q = psq->q[i] + sgd/(ds - dp)*((sgd - dp)*qs + (ds - sgd)*qp);
+
+ if (psq->q[i - 1] < q && q < psq->q[i + 1]) {
+ psq->q[i] = q;
+ } else {
+ if (d > 0)
+ psq->q[i] += qs;
+ else if (d < 0)
+ psq->q[i] -= qp;
+ }
+
+ psq->n[i] += sgd;
+ }
+ }
+ }
+
+ if (psq->blending)
+ psq->blend *= psq->rate;
+}
+
+double psq_get_estimate(struct psquare *psq)
+{
+ ucnum c = psq->count;
+ double est;
+
+ if (c > 2 && c < 5)
+ qsort(psq->q, c, sizeof psq->q[0], dbl_cmp);
+
+ switch (psq->count) {
+ case 0:
+ est = 0.0;
+ break;
+ case 1:
+ est = psq->q[0];
+ break;
+ case 2:
+ est = psq->q[0] + (psq->q[1] - psq->q[0]) / 2;
+ break;
+ case 3:
+ est = psq->q[1];
+ break;
+ case 4:
+ est = psq->q[1] + (psq->q[2] - psq->q[1]) / 2;
+ break;
+ default:
+ est = psq->q[2];
+ break;
+ }
+
+ if (!psq->blending)
+ return est;
+
+ return psq->blend * psq->prev + (1 - psq->blend) * est;
+}
diff --git a/psquare.h b/psquare.h
new file mode 100644
index 00000000..77580e02
--- /dev/null
+++ b/psquare.h
@@ -0,0 +1,54 @@
+/* Copyright 2021
+ * Kaz Kylheku <kaz@kylheku.com>
+ * Vancouver, Canada
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+enum psq_type {
+ psq_regular,
+ psq_grouped
+};
+
+struct psquare {
+ enum psq_type type;
+ double p;
+ double q[5];
+ double wn[5];
+ double dn[5];
+ cnum n[5];
+ ucnum count;
+ /* psq_grouped fields */
+ ucnum grsize;
+ double prev;
+ double rate;
+ double blend;
+ int blending;
+};
+
+void psq_init(struct psquare *, double p);
+void psq_init_grouped(struct psquare *, double p,
+ ucnum grsize, double rate);
+void psq_add_sample(struct psquare *, double s, val self);
+double psq_get_estimate(struct psquare *);
diff --git a/stdlib/doc-syms.tl b/stdlib/doc-syms.tl
index 94b2cf89..60a81501 100644
--- a/stdlib/doc-syms.tl
+++ b/stdlib/doc-syms.tl
@@ -1503,6 +1503,7 @@
("pwd" "N-0047F5F6")
("qquote" "N-01665185")
("qref" "D-006E")
+ ("quantile" "N-0318C018")
("quip" "N-03C6D422")
("quote" "N-0163F998")
("r$" "N-03BBB0C5")
diff --git a/tests/016/arith.tl b/tests/016/arith.tl
index be0d512b..34a82c7f 100644
--- a/tests/016/arith.tl
+++ b/tests/016/arith.tl
@@ -222,3 +222,34 @@
(< '(1 2 3) #(1 2 3.0)) nil
(< '(1 2 3) #(1 2 3 4)) t
(< '(1 2 3) #(1 2 4)) t)
+
+(test
+ (let ((q (quantile 0.5)))
+ [q 0.02 0.5 0.74 3.39 0.83]
+ [mapcar q '(22.37 10.15 15.43 38.62 15.92
+ 34.60 10.28 1.47 0.40 0.05 11.39
+ 0.27 0.42 0.09 11.37)])
+ (0.73999999999999999 0.73999999999999999 2.0616666666666665
+ 4.5517592592592591 4.5517592592592591 9.1519618055555547
+ 9.1519618055555547 9.1519618055555547 9.1519618055555547
+ 6.1797614914021164 6.1797614914021164 6.1797614914021164
+ 6.1797614914021164 4.2462394088036444 4.2462394088036444))
+
+(test
+ (let ((q (quantile 0)))
+ (cons [q] [mapcar q '(1 2 3 4 5)]))
+ (0.0 1.0 1.5 2.0 2.5 3.0))
+
+(test
+ (let ((q (quantile 0 5 0.5)))
+ [mapcar q '(1.0 2.0 3.0 4.0 5.0
+ 0.0 0.0 0.0 0.0 0.0)])
+ (1.0 1.5 2.0 2.5 3.0
+ 1.5 0.75 0.375 0.1875 0.09375))
+
+(test
+ (let ((q (quantile 0 5 0.5)))
+ [mapcar q '(0.0 0.0 0.0 0.0 0.0
+ 3.0 3.0 3.0 3.0 3.0)])
+ (0.0 0.0 0.0 0.0 0.0
+ 1.5 2.25 2.625 2.8125 2.90625))
diff --git a/txr.1 b/txr.1
index 498f9605..8281de4a 100644
--- a/txr.1
+++ b/txr.1
@@ -46577,6 +46577,149 @@ benefits from ordering the operations on multiple integer operands
according to the magnitudes of those operands. The function provides an
estimate of magnitude which trades accuracy for efficiency.
+.coNP Function @ quantile
+.synb
+.mets (quantile < p >> [ group-size <> [ rate ]])
+.syne
+.desc
+The
+.code quantile
+function returns a function which estimates a specific quantile
+of a set of real-valued samples. The desired quantile is indicated
+by the
+.meta p
+parameter, which is a number in the range 0 to 1.0. If
+.meta p
+is specified as 0.5, then the median is estimated.
+The
+.meta p
+value of 0.9 leads to the estimation of the 90th percentile:
+a value such that approximately 90% of the samples are below that value.
+
+If the
+.meta group-size
+parameter is specified, it must be a positive integer.
+The returned function then operates in grouped mode. The
+.meta rate
+parameter is relevant only to grouped mode. Grouped mode is
+described below.
+
+The function returned by
+.code quantile
+maintains internal state in relation to calculating the quantile.
+The function may be called with any number of arguments, including
+none. It expects every argument to be either a number, or a sequence
+of numbers. These numbers are accumulated into the quantile calculation,
+and a revised estimate of the quantile is then returned.
+
+Note: the algorithm used is the P-Squared algorithm invented in 1985 by Raj
+Jain and Imrich Chlamtac, which avoids accumulating and sorting the entire data
+set, while still obtaining good quality estimates of the quantile.
+The algorithm requires an initial seed of five samples. Then additional
+samples input into the algorithm produce quantile estimates. To eliminate this
+special case from the abstract interface, the \*(TX implementation is capable of
+producing an estimate when five or fewer samples have been presented, including
+none. In this low situation, the
+.meta p
+value is ignored in reporting the estimate. When no samples have been given,
+the estimate is zero. When one sample has been given, the estimate is that
+sample itself. When between two and five samples have been given, the estimate
+is their median. Using the median as the estimate ensures a smooth transition
+from these early estimates into the estimates produced by the P-Squared
+algorithm. This is because the P-Squared algorithm always reports the value of
+the middle height accumulator as the estimate, and that accumulator's initial
+value is the median of the first five samples.
+
+The function returned by
+.codn quantile ,
+though not accumulating all of the samples passed to it, nevertheless has
+a limited sample capacity, because the registers it uses for tracking the
+sample group positions are fixed-width integers. The sample capacity is
+approximately 4 times the value of
+.codn fixnum-max .
+
+.TP* Example:
+
+.verb
+ (defparm q (quantile 0.9)) ;; create 90-th percentile accumulator
+
+ [q] -> 0.0 ;; no samples given: estimate is 0.
+ [q 3.14] -> 3.14 ;; one sample: estimate is that sample
+ [q 13.3 7.9 5.2 6.3] -> 7.9 ;; five samples: estimate is median.
+ [q 6.8 7.3 9.1 4.0] ;; more than five samples; estimate now
+ -> 8.44651234567901 ;; from P-Square algorithm
+ [q #(13.1 5 2.5)] ;; vector argument
+ -> 9.68660493827161
+ [q] -> 9.68660493827161 ;; no arguments: repeat current estimate
+.brev
+
+If the
+.meta group-size
+argument is specified, then the quantile accumulator operates in grouped mode.
+Grouped mode allows infinite sample operation without overflow: an unlimited
+number of samples can be accepted. However, old samples lose their influence
+over the estimated value: newer samples are considered more significant than
+old samples.
+
+In grouped mode, the quantile accumulator is reset to its initial state whenever
+.meta group-size
+samples have been accumulated, and begins freshly calculating the quantile.
+Prior to the reset, an estimate is obtained and retained in an internal
+register. Going forward, this remembered previous estimate is blended in with
+the newly calculated estimate values, as described below. The cycle repeats
+itself whenever
+.meta group-size
+samples accumulate: the state is reset, and the current estimate is loaded into
+the previous estimate register, which is then blended with newly computed
+values.
+
+The
+.meta rate
+parameter, whose default value is 0.999, controls the estimate blending.
+It should be a value between 0 and 1.
+
+Upon each reset, a blend value register is initialized to 1.0. Each time
+a new sample is accumulated, the blend register is multiplied by the rate
+parameter, and the product is stored back into the blend register.
+Thus if the rate is between 0 and 1, exclusive, then the blend register
+exponentially decreases as the number of samples grows. The blend register
+indicates the fraction of the estimate which comes from the remembered previous
+estimate.
+
+For instance, if the current blend value is 0.8, then the returned estimate
+value is 0.8 times the remembered previous estimate, plus 0.2 times the newly
+computed estimate for the current sample in the new group: the previous and
+current estimate are blended 80:20.
+
+The default
+.meta rate
+value of 0.999 is chosen for a slow transition to the new estimates, which
+helps to conceal inaccuracies in the algorithm associated with having
+accumulated a small number of samples. At this rate, it requires about 290
+samples before the blend value drops to 75% of the old estimate.
+
+If
+.code rate
+is specified as 0, then no blending of the previous estimate value
+takes place, since the blend factor will drop to zero upon the first
+sample being received after the group reset, causing the newly calculated
+estimates to be returned without blending. The previous sample groups
+therefore have no influence over newer estimates. If
+.code rate
+is specified as 1, then the blend factor will stay at 1, and so the
+estimate will forever remain at the previous value, ignoring the
+calculations driven by the new samples.
+
+Note: it is recommended that if
+.meta group-size
+is specified, the value should be at least several hundred. Too small
+a group size will prevent the estimation algorithm from settling on
+good results. The
+.meta rate
+parameter should not much smaller than 1. A rate too low will cause
+the previous estimate's contribution to the quantile value to diminish,
+too quickly, before the new estimation settles.
+
.coNP Variables @, flo-near @, flo-down @ flo-up and @ flo-zero
.desc
These variables hold integer values suitable as arguments to the