summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKaz Kylheku <kaz@kylheku.com>2012-03-13 12:57:21 -0700
committerKaz Kylheku <kaz@kylheku.com>2012-03-13 12:57:21 -0700
commit5961f0de80abce4645ec2f022b2346e24b6479ed (patch)
tree2ae5ac4b66170ddf43bbc968f1160cd9aa9f2be9
parentaa62864118b755f10b30fd58a3b7fd8407ce8c6c (diff)
downloadtxr-5961f0de80abce4645ec2f022b2346e24b6479ed.tar.gz
txr-5961f0de80abce4645ec2f022b2346e24b6479ed.tar.bz2
txr-5961f0de80abce4645ec2f022b2346e24b6479ed.zip
Implementing URL filtering.
* eval.c (eval_init): New intrinsic functions: url-encode, url-decode. * filter.c (tourl_k, fromurl_k): New keyword variables. (is_url_reserved, digit_value): New static functions. (url_encode, url_decode): New functions. (filter_init): Intialize new keyword variables and register new :tourl and :fromurl filters. * filter.h (tourl_k, fromurl_k, url_encode, url_decode): Declared. * txr.1: Updated. * txr.vim: Likewise.
-rw-r--r--ChangeLog18
-rw-r--r--eval.c4
-rw-r--r--filter.c79
-rw-r--r--filter.h5
-rw-r--r--txr.121
-rw-r--r--txr.vim6
6 files changed, 131 insertions, 2 deletions
diff --git a/ChangeLog b/ChangeLog
index 8298ec3e..807896fb 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,23 @@
2012-03-13 Kaz Kylheku <kaz@kylheku.com>
+ Implementing URL filtering.
+
+ * eval.c (eval_init): New intrinsic functions: url-encode, url-decode.
+
+ * filter.c (tourl_k, fromurl_k): New keyword variables.
+ (is_url_reserved, digit_value): New static functions.
+ (url_encode, url_decode): New functions.
+ (filter_init): Intialize new keyword variables and register
+ new :tourl and :fromurl filters.
+
+ * filter.h (tourl_k, fromurl_k, url_encode, url_decode): Declared.
+
+ * txr.1: Updated.
+
+ * txr.vim: Likewise.
+
+2012-03-13 Kaz Kylheku <kaz@kylheku.com>
+
* stream.c (string_out_byte_flush): Bugfix. Do not loop inside this
function. This must not flush out more than one character out of this
small buffer, except when we are flushing out the last data.
diff --git a/eval.c b/eval.c
index c527d9de..156e58e2 100644
--- a/eval.c
+++ b/eval.c
@@ -43,6 +43,7 @@
#include "debug.h"
#include "match.h"
#include "rand.h"
+#include "filter.h"
#include "eval.h"
typedef val (*opfun_t)(val, val);
@@ -2354,6 +2355,9 @@ void eval_init(void)
reg_fun(intern(lit("match-fun"), user_package), func_n4(match_fun));
+ reg_fun(intern(lit("url-encode"), user_package), func_n1(url_encode));
+ reg_fun(intern(lit("url-decode"), user_package), func_n1(url_decode));
+
eval_error_s = intern(lit("eval-error"), user_package);
uw_register_subtype(eval_error_s, error_s);
}
diff --git a/filter.c b/filter.c
index 9dd07e11..f66420ba 100644
--- a/filter.c
+++ b/filter.c
@@ -26,8 +26,11 @@
#include <stddef.h>
#include <setjmp.h>
+#include <string.h>
#include <wctype.h>
#include <wchar.h>
+#include <stdarg.h>
+#include <dirent.h>
#include "config.h"
#include "lib.h"
#include "hash.h"
@@ -35,10 +38,12 @@
#include "match.h"
#include "filter.h"
#include "gc.h"
+#include "stream.h"
val filters;
val filter_k, lfilt_k, rfilt_k, to_html_k, from_html_k;
val upcase_k, downcase_k, fun_k;
+val tourl_k, fromurl_k;
static val make_trie(void)
{
@@ -573,6 +578,75 @@ static val html_numeric_handler(val ch)
return func_f1(cons(ch, nil), html_dec_continue);
}
+static int is_url_reserved(int ch)
+{
+ return (ch <= 0x20 || ch >= 0x7F || strchr(":/?#[]@!$&'()*+,;=%", ch) != 0);
+}
+
+val url_encode(val str)
+{
+ val in_byte = make_string_byte_input_stream(str);
+ val out = make_string_output_stream();
+ val ch;
+
+ while ((ch = get_byte(in_byte)) != nil) {
+ int c = c_num(ch);
+
+ if (is_url_reserved(c))
+ format(out, lit("%~1X~1X"), num_fast(c >> 4), num_fast(c & 0xf), nao);
+ else
+ put_char(chr_num(ch), out);
+ }
+
+ return get_string_from_stream(out);
+}
+
+static int digit_value(int digit)
+{
+ if (digit >= '0' && digit <= '9')
+ return digit - '0';
+ if (digit >= 'A' && digit <= 'F')
+ return digit - 'A' + 10;
+ if (digit >= 'a' && digit <= 'f')
+ return digit - 'a' + 10;
+ internal_error("bad digit");
+}
+
+val url_decode(val str)
+{
+ val in = make_string_input_stream(str);
+ val out = make_string_output_stream();
+
+ for (;;) {
+ val ch = get_char(in);
+
+ if (ch == chr('%')) {
+ val ch2 = get_char(in);
+ val ch3 = get_char(in);
+
+ if (ch2 && ch3 && chr_isxdigit(ch2) && chr_isxdigit(ch3)) {
+ int byte = digit_value(c_num(ch2)) << 4 | digit_value(c_num(ch3));
+ put_byte(num_fast(byte), out);
+ } else {
+ put_char(ch, out);
+ if (!ch2)
+ break;
+ put_char(ch2, out);
+ if (!ch3)
+ break;
+ put_char(ch3, out);
+ }
+ continue;
+ }
+ if (!ch)
+ break;
+
+ put_char(ch, out);
+ }
+
+ return get_string_from_stream(out);
+}
+
void filter_init(void)
{
protect(&filters, (val *) 0);
@@ -586,6 +660,9 @@ void filter_init(void)
upcase_k = intern(lit("upcase"), keyword_package);
downcase_k = intern(lit("downcase"), keyword_package);
fun_k = intern(lit("fun"), keyword_package);
+ tourl_k = intern(lit("tourl"), keyword_package);
+ fromurl_k = intern(lit("fromurl"), keyword_package);
+
sethash(filters, to_html_k, build_filter(to_html_table, t));
{
val trie = build_filter(from_html_table, nil);
@@ -595,4 +672,6 @@ void filter_init(void)
}
sethash(filters, upcase_k, func_n1(upcase_str));
sethash(filters, downcase_k, func_n1(downcase_str));
+ sethash(filters, tourl_k, func_n1(url_encode));
+ sethash(filters, fromurl_k, func_n1(url_decode));
}
diff --git a/filter.h b/filter.h
index a3dcab2f..1a084e66 100644
--- a/filter.h
+++ b/filter.h
@@ -27,6 +27,7 @@
extern val filters;
extern val filter_k, lfilt_k, rfilt_k, to_html_k, from_html_k;
extern val upcase_k, downcase_k, fun_k;
+extern val tourl_k, fromurl_k;
val trie_lookup_begin(val trie);
val trie_value_at(val node);
@@ -36,4 +37,8 @@ val filter_string(val trie, val str);
val filter_equal(val lfilt, val rfilt, val left, val right);
val register_filter(val sym, val table);
+val url_encode(val str);
+val url_decode(val str);
+
void filter_init(void);
+
diff --git a/txr.1 b/txr.1
index fbc61d4d..e5e63478 100644
--- a/txr.1
+++ b/txr.1
@@ -3638,6 +3638,25 @@ Convert the 26 lower case letters of the English alphabet to upper case.
.IP :downcase
Convert the 26 upper case letters of the English alphabet to lower case.
+.IP :fromurl
+Decode URL-encoded (a.k.a. percent-encoded) text. Character triplets consisting
+of the % character followed by a pair of hexadecimal digits (case insensitive)
+are are converted to bytes having the value represented by the hexadecimal
+digits (most significant nybble first). Sequences of one or more such bytes are
+treated as UTF-8 data and decoded to characters.
+
+.IP :tourl
+Convert to URL encoding according to RFC 3986. The text is first converted
+to UTF-8 bytes. The bytes are then converted back to text as follows.
+Bytes in the range 0 to 32, and 127 to 255 (note: including the ASCII DEL),
+bytes whose values correspond to ASCII characters which are listed by RFC 3986
+as being in the "reserved set", and the byte value corresponding to the
+ASCII % character are encoded as a three-character sequence consisting
+of the % character followed by two hexadecimal digits derived from the
+byte value (most significant nybble first, upper case). All other bytes
+are converted directly to characters of the same value without any such
+encoding.
+
Example: to escape HTML characters in all variable substitutions occuring in an
output clause, specify :filter :to_html in the directive:
@@ -6754,6 +6773,8 @@ Certain object types have a custom equal function.
.SS Function match-fun
+.SS Functions url-encode and url-decode
+
.SH APPENDIX A: NOTES ON EXOTIC REGULAR EXPRESSIONS
Users familiar with regular expressions may not be familiar with the complement
diff --git a/txr.vim b/txr.vim
index f59de0fa..edecc5aa 100644
--- a/txr.vim
+++ b/txr.vim
@@ -45,7 +45,9 @@ syn keyword txl_keyword contained remq remql remqual
syn keyword txl_keyword contained all none eq eql equal + - * abs trunc mod
syn keyword txl_keyword contained expt exptmod sqrt gcd fixnump bignump
syn keyword txl_keyword contained numberp zerop evenp oddp >
-syn keyword txl_keyword contained < >= <= max min search-regex match-regex regsub
+
+.SS Functions url-encode and url-decode
+
syn keyword txl_keyword contained make-hash hash gethash sethash pushhash remhash
syn keyword txl_keyword contained hash-count get-hash-userdata set-hash-userdata hashp maphash
syn keyword txl_keyword contained hash-eql hash-equal eval *stdout* *stdin* *stddebug*
@@ -83,7 +85,7 @@ syn keyword txl_keyword contained make-random-state random-state-p
syn keyword txl_keyword contained random-fixnum random rand
syn keyword txl_keyword contained range range* generate repeat force
-syn keyword txl_keyword contained throw throwf error match-fun
+syn keyword txl_keyword contained throw throwf error match-fun url-encode url-decode
syn match txr_error "@[\t ]*[*]\?[\t ]*."
syn match txr_nested_error "[^\t `]\+" contained