8 files changed, 452 insertions, 1 deletions
diff --git a/Makefile b/Makefile
index bf79d15b..d8bc6797 100644
--- a/Makefile
+++ b/Makefile
@@ -55,7 +55,7 @@ OBJS := txr.o lex.yy.o y.tab.o match.o lib.o regex.o gc.o unwind.o stream.o
 OBJS += arith.o hash.o utf8.o filter.o eval.o parser.o rand.o combi.o sysif.o
 OBJS += args.o lisplib.o cadr.o struct.o itypes.o buf.o jmp.o protsym.o ffi.o
 OBJS += strudel.o vm.o chksum.o chksums/sha256.o chksums/crc32.o chksums/md5.o
-OBJS += tree.o time.o
+OBJS += tree.o time.o psquare.o
 OBJS += linenoise/linenoise.o
 OBJS-$(debug_support) += debug.o
 OBJS-$(have_syslog) += syslog.o
diff --git a/arith.c b/arith.c
index d649562f..d3f2b186 100644
--- a/arith.c
+++ b/arith.c
@@ -50,6 +50,7 @@
 #include "itypes.h"
 #include "struct.h"
 #include "txr.h"
+#include "psquare.h"
 #include "arith.h"
 
 #define max(a, b) ((a) > (b) ? (a) : (b))
@@ -4507,6 +4508,55 @@ val lcmv(struct args *nlist)
   return nary_op(lit("lcm"), lcm, abso_self, nlist, zero);
 }
 
+static struct cobj_ops psq_ops = cobj_ops_init(cobj_equal_handle_op,
+                                               cptr_print_op,
+                                               cobj_destroy_free_op,
+                                               cobj_mark_op,
+                                               cobj_handle_hash_op);
+
+static val quant_fun(val psqo, struct args *args)
+{
+  val self = lit("quantile");
+  cnum idx = 0;
+  struct psquare *psq = coerce(struct psquare *, psqo->cp.handle);
+
+  while (args_more(args, idx)) {
+    val arg = args_get(args, &idx);
+    if (numberp(arg)) {
+      double s = c_flo(to_float(self, arg), self);
+      psq_add_sample(psq, s, self);
+    } else {
+      seq_iter_t item_iter;
+      val sample;
+      seq_iter_init(self, &item_iter, arg);
+
+      while (seq_get(&item_iter, &sample)) {
+        double s = c_flo(to_float(self, sample), self);
+        psq_add_sample(psq, s, self);
+      }
+    }
+  }
+
+  return flo(psq_get_estimate(psq));
+}
+
+val quantile(val pv, val grsize_in, val rate_in)
+{
+  val self = lit("quantile");
+  double p = c_flo(to_float(self, pv), self);
+  struct psquare *psq = coerce(struct psquare *, chk_malloc(sizeof *psq));
+  val psqo = cptr_typed(coerce(mem_t *, psq), nil, &psq_ops);
+  if (missingp(grsize_in)) {
+    psq_init(psq, p);
+  } else {
+    ucnum grsize = c_unum(grsize_in, self);
+    double rate = if3(missingp(rate_in),
+                      0.999,
+                      c_flo(to_float(self, rate_in), self));
+    psq_init_grouped(psq, p, grsize, rate);
+  }
+  return func_f0v(psqo, quant_fun);
+}
 
 void arith_init(void)
 {
@@ -4707,6 +4757,7 @@ void arith_init(void)
   reg_fun(intern(lit("b/"), system_package), func_n2(divi));
   reg_fun(neg_s, func_n1(neg));
 
+  reg_fun(intern(lit("quantile"), user_package), func_n3o(quantile, 1));
 #if HAVE_ROUNDING_CTL_H
   reg_varl(intern(lit("flo-near"), user_package), num(FE_TONEAREST));
   reg_varl(intern(lit("flo-down"), user_package), num(FE_DOWNWARD));
diff --git a/arith.h b/arith.h
index 1651853a..21288519 100644
--- a/arith.h
+++ b/arith.h
@@ -61,6 +61,8 @@ val digits(val n, val base);
 val poly(val x, val seq);
 val rpoly(val x, val seq);
 
+val quantile(val pv, val chsize_in, val rate_in);
+
 NORETURN void do_mp_error(val self, mp_err code);
 void arith_init(void);
 void arith_compat_fixup(int compat_ver);
diff --git a/psquare.c b/psquare.c
new file mode 100644
index 00000000..0a1dfb05
--- /dev/null
+++ b/psquare.c
@@ -0,0 +1,169 @@
+#include <stddef.h>
+#include <wchar.h>
+#include <limits.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <signal.h>
+#include <stdio.h>
+#include <math.h>
+#include "config.h"
+#include "lib.h"
+#include "signal.h"
+#include "unwind.h"
+#include "psquare.h"
+
+static void psq_reset(struct psquare *psq)
+{
+  double p = psq->p;
+
+  psq->n[0] = 1;
+  psq->n[1] = 2;
+  psq->n[2] = 3;
+  psq->n[3] = 4;
+  psq->n[4] = 5;
+
+  psq->wn[0] = 1.0;
+  psq->wn[1] = 1.0 + 2.0 * p;
+  psq->wn[2] = 1.0 + 4.0 * p;
+  psq->wn[3] = 3.0 + 2.0 * p;
+  psq->wn[4] = 5.0;
+
+  psq->dn[1] = p / 2.0;
+  psq->dn[2] = p;
+  psq->dn[3] = (1.0 + p) / 2.0;
+  psq->dn[4] = 1.0;
+}
+
+void psq_init(struct psquare *psq, double p)
+{
+  memset(psq, 0, sizeof *psq);
+  psq->p = p;
+  psq_reset(psq);
+}
+
+void psq_init_grouped(struct psquare *psq, double p,
+                      ucnum grsize, double rate)
+{
+  psq_init(psq, p);
+  psq->type = psq_grouped;
+  psq->grsize = grsize;
+  psq->rate = rate;
+  psq->blend = 1.0;
+}
+
+static int dbl_cmp(const void *lp, const void *rp)
+{
+  double ln = *coerce(const double *, lp);
+  double rn = *coerce(const double *, rp);
+
+  if (ln < rn)
+    return -1;
+  if (ln > rn)
+    return 1;
+  return 0;
+}
+
+void psq_add_sample(struct psquare *psq, double s, val self)
+{
+  ucnum c = psq->count++;
+
+  if (psq->type == psq_grouped && c >= psq->grsize) {
+    psq->prev = psq_get_estimate(psq);
+    psq->blending = 1;
+    psq->count = 1;
+    psq_reset(psq);
+    c = 0;
+  }
+
+  if (c < 5) {
+    psq->q[c] = s;
+  } else {
+    int k = 0, i;
+
+    if (c == 5)
+      qsort(psq->q, c, sizeof psq->q[0], dbl_cmp);
+
+    if (s < psq->q[0]) {
+      psq->q[0] = s;
+    } else if (psq->q[4] <= s) {
+      psq->q[4] = s;
+      k = 3;
+    } else for (k = 0; k < 4; k++) {
+      if (psq->q[k] <= s && s < psq->q[k + 1])
+        break;
+    }
+
+    if (psq->n[4] == INT_PTR_MAX)
+      uw_throwf(numeric_error_s, lit("~a: sample capacity overflow"),
+                self, nao);
+
+    for (i = k + 1; i < 5; i++)
+      psq->n[i]++;
+
+    for (i = 0; i < 5; i++)
+      psq->wn[i] += psq->dn[i];
+
+    for (i = 1; i <= 3; i++) {
+      double d = psq->wn[i] - psq->n[i];
+      double ds = psq->n[i + 1] - psq->n[i];
+      double dp = psq->n[i - 1] - psq->n[i];
+
+      if ((d >= 1 && ds > 1) || (d <= -1 && dp < -1)) {
+        int sgd = d < 0 ? -1 : 1;
+        double qs = (psq->q[i + 1] - psq->q[i]) / ds;
+        double qp = (psq->q[i - 1] - psq->q[i]) / dp;
+        double q = psq->q[i] + sgd/(ds - dp)*((sgd - dp)*qs + (ds - sgd)*qp);
+
+        if (psq->q[i - 1] < q && q < psq->q[i + 1]) {
+          psq->q[i] = q;
+        } else {
+          if (d > 0)
+            psq->q[i] += qs;
+          else if (d < 0)
+            psq->q[i] -= qp;
+        }
+
+        psq->n[i] += sgd;
+      }
+    }
+  }
+
+  if (psq->blending)
+    psq->blend *= psq->rate;
+}
+
+double psq_get_estimate(struct psquare *psq)
+{
+  ucnum c = psq->count;
+  double est;
+
+  if (c > 2 && c < 5)
+    qsort(psq->q, c, sizeof psq->q[0], dbl_cmp);
+
+  switch (psq->count) {
+  case 0:
+    est = 0.0;
+    break;
+  case 1:
+    est = psq->q[0];
+    break;
+  case 2:
+    est = psq->q[0] + (psq->q[1] - psq->q[0]) / 2;
+    break;
+  case 3:
+    est = psq->q[1];
+    break;
+  case 4:
+    est = psq->q[1] + (psq->q[2] - psq->q[1]) / 2;
+    break;
+  default:
+    est = psq->q[2];
+    break;
+  }
+
+  if (!psq->blending)
+    return est;
+
+  return psq->blend * psq->prev + (1 - psq->blend) * est;
+}
diff --git a/psquare.h b/psquare.h
new file mode 100644
index 00000000..77580e02
--- /dev/null
+++ b/psquare.h
@@ -0,0 +1,54 @@
+/* Copyright 2021
+ * Kaz Kylheku <kaz@kylheku.com>
+ * Vancouver, Canada
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+enum psq_type {
+  psq_regular,
+  psq_grouped
+};
+
+struct psquare {
+  enum psq_type type;
+  double p;
+  double q[5];
+  double wn[5];
+  double dn[5];
+  cnum n[5];
+  ucnum count;
+  /* psq_grouped fields */
+  ucnum grsize;
+  double prev;
+  double rate;
+  double blend;
+  int blending;
+};
+
+void psq_init(struct psquare *, double p);
+void psq_init_grouped(struct psquare *, double p,
+                      ucnum grsize, double rate);
+void psq_add_sample(struct psquare *, double s, val self);
+double psq_get_estimate(struct psquare *);
diff --git a/stdlib/doc-syms.tl b/stdlib/doc-syms.tl
index 94b2cf89..60a81501 100644
--- a/stdlib/doc-syms.tl
+++ b/stdlib/doc-syms.tl
@@ -1503,6 +1503,7 @@
       ("pwd" "N-0047F5F6")
       ("qquote" "N-01665185")
       ("qref" "D-006E")
+      ("quantile" "N-0318C018")
       ("quip" "N-03C6D422")
       ("quote" "N-0163F998")
       ("r$" "N-03BBB0C5")
diff --git a/tests/016/arith.tl b/tests/016/arith.tl
index be0d512b..34a82c7f 100644
--- a/tests/016/arith.tl
+++ b/tests/016/arith.tl
@@ -222,3 +222,34 @@
   (< '(1 2 3) #(1 2 3.0)) nil
   (< '(1 2 3) #(1 2 3 4)) t
   (< '(1 2 3) #(1 2 4)) t)
+
+(test
+  (let ((q (quantile 0.5)))
+    [q 0.02 0.5 0.74 3.39 0.83]
+    [mapcar q '(22.37 10.15 15.43 38.62 15.92
+                34.60 10.28 1.47 0.40 0.05 11.39
+                0.27 0.42 0.09 11.37)])
+  (0.73999999999999999 0.73999999999999999 2.0616666666666665
+   4.5517592592592591 4.5517592592592591 9.1519618055555547
+   9.1519618055555547 9.1519618055555547 9.1519618055555547
+   6.1797614914021164 6.1797614914021164 6.1797614914021164
+   6.1797614914021164 4.2462394088036444 4.2462394088036444))
+
+(test
+  (let ((q (quantile 0)))
+    (cons [q] [mapcar q '(1 2 3 4 5)]))
+  (0.0 1.0 1.5 2.0 2.5 3.0))
+
+(test
+  (let ((q (quantile 0 5 0.5)))
+    [mapcar q '(1.0 2.0 3.0 4.0 5.0
+                0.0 0.0 0.0 0.0 0.0)])
+  (1.0 1.5 2.0 2.5 3.0
+   1.5 0.75 0.375 0.1875 0.09375))
+
+(test
+  (let ((q (quantile 0 5 0.5)))
+    [mapcar q '(0.0 0.0 0.0 0.0 0.0
+                3.0 3.0 3.0 3.0 3.0)])
+  (0.0 0.0 0.0 0.0 0.0
+   1.5 2.25 2.625 2.8125 2.90625))
diff --git a/txr.1 b/txr.1
index 498f9605..8281de4a 100644
--- a/txr.1
+++ b/txr.1
@@ -46577,6 +46577,149 @@ benefits from ordering the operations on multiple integer operands
 according to the magnitudes of those operands. The function provides an
 estimate of magnitude which trades accuracy for efficiency.
 
+.coNP Function @ quantile
+.synb
+.mets (quantile < p >> [ group-size <> [ rate ]])
+.syne
+.desc
+The
+.code quantile
+function returns a function which estimates a specific quantile
+of a set of real-valued samples. The desired quantile is indicated
+by the
+.meta p
+parameter, which is a number in the range 0 to 1.0. If
+.meta p
+is specified as 0.5, then the median is estimated.
+The
+.meta p
+value of 0.9 leads to the estimation of the 90th percentile:
+a value such that approximately 90% of the samples are below that value.
+
+If the
+.meta group-size
+parameter is specified, it must be a positive integer.
+The returned function then operates in grouped mode. The
+.meta rate
+parameter is relevant only to grouped mode. Grouped mode is
+described below.
+
+The function returned by
+.code quantile
+maintains internal state in relation to calculating the quantile.
+The function may be called with any number of arguments, including
+none. It expects every argument to be either a number, or a sequence
+of numbers. These numbers are accumulated into the quantile calculation,
+and a revised estimate of the quantile is then returned.
+
+Note: the algorithm used is the P-Squared algorithm invented in 1985 by Raj
+Jain and Imrich Chlamtac, which avoids accumulating and sorting the entire data
+set, while still obtaining good quality estimates of the quantile.
+The algorithm requires an initial seed of five samples. Then additional
+samples input into the algorithm produce quantile estimates.  To eliminate this
+special case from the abstract interface, the \*(TX implementation is capable of
+producing an estimate when five or fewer samples have been presented, including
+none. In this low situation, the
+.meta p
+value is ignored in reporting the estimate.  When no samples have been given,
+the estimate is zero.  When one sample has been given, the estimate is that
+sample itself. When between two and five samples have been given, the estimate
+is their median. Using the median as the estimate ensures a smooth transition
+from these early estimates into the estimates produced by the P-Squared
+algorithm. This is because the P-Squared algorithm always reports the value of
+the middle height accumulator as the estimate, and that accumulator's initial
+value is the median of the first five samples.
+
+The function returned by
+.codn quantile ,
+though not accumulating all of the samples passed to it, nevertheless has
+a limited sample capacity, because the registers it uses for tracking the
+sample group positions are fixed-width integers. The sample capacity is
+approximately 4 times the value of
+.codn fixnum-max .
+
+.TP* Example:
+
+.verb
+   (defparm q (quantile 0.9))   ;; create 90-th percentile accumulator
+
+   [q] -> 0.0                   ;; no samples given: estimate is 0.
+   [q 3.14] -> 3.14             ;; one sample: estimate is that sample
+   [q 13.3 7.9 5.2 6.3] -> 7.9  ;; five samples: estimate is median.
+   [q 6.8 7.3 9.1 4.0]          ;; more than five samples; estimate now
+   -> 8.44651234567901          ;; from P-Square algorithm
+   [q #(13.1 5 2.5)]            ;; vector argument
+   -> 9.68660493827161
+   [q] -> 9.68660493827161      ;; no arguments: repeat current estimate
+.brev
+
+If the
+.meta group-size
+argument is specified, then the quantile accumulator operates in grouped mode.
+Grouped mode allows infinite sample operation without overflow: an unlimited
+number of samples can be accepted. However, old samples lose their influence
+over the estimated value: newer samples are considered more significant than
+old samples.
+
+In grouped mode, the quantile accumulator is reset to its initial state whenever
+.meta group-size
+samples have been accumulated, and begins freshly calculating the quantile.
+Prior to the reset, an estimate is obtained and retained in an internal
+register. Going forward, this remembered previous estimate is blended in with
+the newly calculated estimate values, as described below.  The cycle repeats
+itself whenever
+.meta group-size
+samples accumulate: the state is reset, and the current estimate is loaded into
+the previous estimate register, which is then blended with newly computed
+values.
+
+The
+.meta rate
+parameter, whose default value is 0.999, controls the estimate blending.
+It should be a value between 0 and 1.
+
+Upon each reset, a blend value register is initialized to 1.0. Each time
+a new sample is accumulated, the blend register is multiplied by the rate
+parameter, and the product is stored back into the blend register.
+Thus if the rate is between 0 and 1, exclusive, then the blend register
+exponentially decreases as the number of samples grows. The blend register
+indicates the fraction of the estimate which comes from the remembered previous
+estimate.
+
+For instance, if the current blend value is 0.8, then the returned estimate
+value is 0.8 times the remembered previous estimate, plus 0.2 times the newly
+computed estimate for the current sample in the new group: the previous and
+current estimate are blended 80:20.
+
+The default
+.meta rate
+value of 0.999 is chosen for a slow transition to the new estimates, which
+helps to conceal inaccuracies in the algorithm associated with having
+accumulated a small number of samples. At this rate, it requires about 290
+samples before the blend value drops to 75% of the old estimate.
+
+If
+.code rate
+is specified as 0, then no blending of the previous estimate value
+takes place, since the blend factor will drop to zero upon the first
+sample being received after the group reset, causing the newly calculated
+estimates to be returned without blending. The previous sample groups
+therefore have no influence over newer estimates. If
+.code rate
+is specified as 1, then the blend factor will stay at 1, and so the
+estimate will forever remain at the previous value, ignoring the
+calculations driven by the new samples.
+
+Note: it is recommended that if
+.meta group-size
+is specified, the value should be at least several hundred. Too small
+a group size will prevent the estimation algorithm from settling on
+good results. The
+.meta rate
+parameter should not much smaller than 1. A rate too low will cause
+the previous estimate's contribution to the quantile value to diminish,
+too quickly, before the new estimation settles.
+
 .coNP Variables @, flo-near @, flo-down @ flo-up and @ flo-zero
 .desc
 These variables hold integer values suitable as arguments to the