diff options
author | Kaz Kylheku <kaz@kylheku.com> | 2012-03-13 12:57:21 -0700 |
---|---|---|
committer | Kaz Kylheku <kaz@kylheku.com> | 2012-03-13 12:57:21 -0700 |
commit | 5961f0de80abce4645ec2f022b2346e24b6479ed (patch) | |
tree | 2ae5ac4b66170ddf43bbc968f1160cd9aa9f2be9 | |
parent | aa62864118b755f10b30fd58a3b7fd8407ce8c6c (diff) | |
download | txr-5961f0de80abce4645ec2f022b2346e24b6479ed.tar.gz txr-5961f0de80abce4645ec2f022b2346e24b6479ed.tar.bz2 txr-5961f0de80abce4645ec2f022b2346e24b6479ed.zip |
Implementing URL filtering.
* eval.c (eval_init): New intrinsic functions: url-encode, url-decode.
* filter.c (tourl_k, fromurl_k): New keyword variables.
(is_url_reserved, digit_value): New static functions.
(url_encode, url_decode): New functions.
(filter_init): Intialize new keyword variables and register
new :tourl and :fromurl filters.
* filter.h (tourl_k, fromurl_k, url_encode, url_decode): Declared.
* txr.1: Updated.
* txr.vim: Likewise.
-rw-r--r-- | ChangeLog | 18 | ||||
-rw-r--r-- | eval.c | 4 | ||||
-rw-r--r-- | filter.c | 79 | ||||
-rw-r--r-- | filter.h | 5 | ||||
-rw-r--r-- | txr.1 | 21 | ||||
-rw-r--r-- | txr.vim | 6 |
6 files changed, 131 insertions, 2 deletions
@@ -1,5 +1,23 @@ 2012-03-13 Kaz Kylheku <kaz@kylheku.com> + Implementing URL filtering. + + * eval.c (eval_init): New intrinsic functions: url-encode, url-decode. + + * filter.c (tourl_k, fromurl_k): New keyword variables. + (is_url_reserved, digit_value): New static functions. + (url_encode, url_decode): New functions. + (filter_init): Intialize new keyword variables and register + new :tourl and :fromurl filters. + + * filter.h (tourl_k, fromurl_k, url_encode, url_decode): Declared. + + * txr.1: Updated. + + * txr.vim: Likewise. + +2012-03-13 Kaz Kylheku <kaz@kylheku.com> + * stream.c (string_out_byte_flush): Bugfix. Do not loop inside this function. This must not flush out more than one character out of this small buffer, except when we are flushing out the last data. @@ -43,6 +43,7 @@ #include "debug.h" #include "match.h" #include "rand.h" +#include "filter.h" #include "eval.h" typedef val (*opfun_t)(val, val); @@ -2354,6 +2355,9 @@ void eval_init(void) reg_fun(intern(lit("match-fun"), user_package), func_n4(match_fun)); + reg_fun(intern(lit("url-encode"), user_package), func_n1(url_encode)); + reg_fun(intern(lit("url-decode"), user_package), func_n1(url_decode)); + eval_error_s = intern(lit("eval-error"), user_package); uw_register_subtype(eval_error_s, error_s); } @@ -26,8 +26,11 @@ #include <stddef.h> #include <setjmp.h> +#include <string.h> #include <wctype.h> #include <wchar.h> +#include <stdarg.h> +#include <dirent.h> #include "config.h" #include "lib.h" #include "hash.h" @@ -35,10 +38,12 @@ #include "match.h" #include "filter.h" #include "gc.h" +#include "stream.h" val filters; val filter_k, lfilt_k, rfilt_k, to_html_k, from_html_k; val upcase_k, downcase_k, fun_k; +val tourl_k, fromurl_k; static val make_trie(void) { @@ -573,6 +578,75 @@ static val html_numeric_handler(val ch) return func_f1(cons(ch, nil), html_dec_continue); } +static int is_url_reserved(int ch) +{ + return (ch <= 0x20 || ch >= 0x7F || strchr(":/?#[]@!$&'()*+,;=%", ch) != 0); +} + +val url_encode(val str) +{ + val in_byte = make_string_byte_input_stream(str); + val out = make_string_output_stream(); + val ch; + + while ((ch = get_byte(in_byte)) != nil) { + int c = c_num(ch); + + if (is_url_reserved(c)) + format(out, lit("%~1X~1X"), num_fast(c >> 4), num_fast(c & 0xf), nao); + else + put_char(chr_num(ch), out); + } + + return get_string_from_stream(out); +} + +static int digit_value(int digit) +{ + if (digit >= '0' && digit <= '9') + return digit - '0'; + if (digit >= 'A' && digit <= 'F') + return digit - 'A' + 10; + if (digit >= 'a' && digit <= 'f') + return digit - 'a' + 10; + internal_error("bad digit"); +} + +val url_decode(val str) +{ + val in = make_string_input_stream(str); + val out = make_string_output_stream(); + + for (;;) { + val ch = get_char(in); + + if (ch == chr('%')) { + val ch2 = get_char(in); + val ch3 = get_char(in); + + if (ch2 && ch3 && chr_isxdigit(ch2) && chr_isxdigit(ch3)) { + int byte = digit_value(c_num(ch2)) << 4 | digit_value(c_num(ch3)); + put_byte(num_fast(byte), out); + } else { + put_char(ch, out); + if (!ch2) + break; + put_char(ch2, out); + if (!ch3) + break; + put_char(ch3, out); + } + continue; + } + if (!ch) + break; + + put_char(ch, out); + } + + return get_string_from_stream(out); +} + void filter_init(void) { protect(&filters, (val *) 0); @@ -586,6 +660,9 @@ void filter_init(void) upcase_k = intern(lit("upcase"), keyword_package); downcase_k = intern(lit("downcase"), keyword_package); fun_k = intern(lit("fun"), keyword_package); + tourl_k = intern(lit("tourl"), keyword_package); + fromurl_k = intern(lit("fromurl"), keyword_package); + sethash(filters, to_html_k, build_filter(to_html_table, t)); { val trie = build_filter(from_html_table, nil); @@ -595,4 +672,6 @@ void filter_init(void) } sethash(filters, upcase_k, func_n1(upcase_str)); sethash(filters, downcase_k, func_n1(downcase_str)); + sethash(filters, tourl_k, func_n1(url_encode)); + sethash(filters, fromurl_k, func_n1(url_decode)); } @@ -27,6 +27,7 @@ extern val filters; extern val filter_k, lfilt_k, rfilt_k, to_html_k, from_html_k; extern val upcase_k, downcase_k, fun_k; +extern val tourl_k, fromurl_k; val trie_lookup_begin(val trie); val trie_value_at(val node); @@ -36,4 +37,8 @@ val filter_string(val trie, val str); val filter_equal(val lfilt, val rfilt, val left, val right); val register_filter(val sym, val table); +val url_encode(val str); +val url_decode(val str); + void filter_init(void); + @@ -3638,6 +3638,25 @@ Convert the 26 lower case letters of the English alphabet to upper case. .IP :downcase Convert the 26 upper case letters of the English alphabet to lower case. +.IP :fromurl +Decode URL-encoded (a.k.a. percent-encoded) text. Character triplets consisting +of the % character followed by a pair of hexadecimal digits (case insensitive) +are are converted to bytes having the value represented by the hexadecimal +digits (most significant nybble first). Sequences of one or more such bytes are +treated as UTF-8 data and decoded to characters. + +.IP :tourl +Convert to URL encoding according to RFC 3986. The text is first converted +to UTF-8 bytes. The bytes are then converted back to text as follows. +Bytes in the range 0 to 32, and 127 to 255 (note: including the ASCII DEL), +bytes whose values correspond to ASCII characters which are listed by RFC 3986 +as being in the "reserved set", and the byte value corresponding to the +ASCII % character are encoded as a three-character sequence consisting +of the % character followed by two hexadecimal digits derived from the +byte value (most significant nybble first, upper case). All other bytes +are converted directly to characters of the same value without any such +encoding. + Example: to escape HTML characters in all variable substitutions occuring in an output clause, specify :filter :to_html in the directive: @@ -6754,6 +6773,8 @@ Certain object types have a custom equal function. .SS Function match-fun +.SS Functions url-encode and url-decode + .SH APPENDIX A: NOTES ON EXOTIC REGULAR EXPRESSIONS Users familiar with regular expressions may not be familiar with the complement @@ -45,7 +45,9 @@ syn keyword txl_keyword contained remq remql remqual syn keyword txl_keyword contained all none eq eql equal + - * abs trunc mod syn keyword txl_keyword contained expt exptmod sqrt gcd fixnump bignump syn keyword txl_keyword contained numberp zerop evenp oddp > -syn keyword txl_keyword contained < >= <= max min search-regex match-regex regsub + +.SS Functions url-encode and url-decode + syn keyword txl_keyword contained make-hash hash gethash sethash pushhash remhash syn keyword txl_keyword contained hash-count get-hash-userdata set-hash-userdata hashp maphash syn keyword txl_keyword contained hash-eql hash-equal eval *stdout* *stdin* *stddebug* @@ -83,7 +85,7 @@ syn keyword txl_keyword contained make-random-state random-state-p syn keyword txl_keyword contained random-fixnum random rand syn keyword txl_keyword contained range range* generate repeat force -syn keyword txl_keyword contained throw throwf error match-fun +syn keyword txl_keyword contained throw throwf error match-fun url-encode url-decode syn match txr_error "@[\t ]*[*]\?[\t ]*." syn match txr_nested_error "[^\t `]\+" contained |