From 8a02974787d66fbf7c92380bab4cac4418fcfa16 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Sun, 12 Apr 2015 16:01:03 -0400 Subject: [PATCH] Add uri_normalize function --- README.md | 16 ++++++++++ test/expected/test.out | 66 ++++++++++++++++++++++++++++++++++++++++-- test/sql/test.sql | 7 +++++ uri.c | 29 +++++++++++++++++++ uri.sql | 7 +++++ 5 files changed, 123 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5ec7e07..4697d87 100644 --- a/README.md +++ b/README.md @@ -110,3 +110,19 @@ A number of functions are provided to extract parts of a URI: Extracts the fragment part of a URI (roughly speaking, everything after the `#`). If there is no fragment part, returns null. + +Other functions: + +- `uri_normalize(uri) returns uri` + + Performs syntax-based normalization of the URI. This includes + case normalization, percent-encoding normalization, and removing + redundant `.` and `..` path segments. See + [RFC 3986 section 6.2.2](http://tools.ietf.org/html/rfc3986#section-6.2.2) + for the full details. + + Note that this module (and similar modules in other programming + languages) compares URIs for equality in their original form, + without normalization. If you want to compare or deduplicate URIs + without regard to mostly irrelevant syntax differences, pass them + through this function first. 
diff --git a/test/expected/test.out b/test/expected/test.out index 26a7588..b9fb392 100644 --- a/test/expected/test.out +++ b/test/expected/test.out @@ -16,6 +16,11 @@ VALUES ('http://www.postgresql.org/'), ('/'), ('foobar'), ('/foobar'); +-- normalization test values from +INSERT INTO test (b) +VALUES ('HTTP://www.EXAMPLE.com/'), + ('http://www.ex%41mple.com/'), + ('eXAMPLE://a/./b/../b/%63/%7bfoo%7d'); SELECT * FROM test; a | b ----+----------------------------------------------------------------------------------------- @@ -33,7 +38,10 @@ SELECT * FROM test; 12 | / 13 | foobar 14 | /foobar -(14 rows) + 15 | HTTP://www.EXAMPLE.com/ + 16 | http://www.ex%41mple.com/ + 17 | eXAMPLE://a/./b/../b/%63/%7bfoo%7d +(17 rows) -- error cases SELECT uri 'http://host:port/'; @@ -42,6 +50,7 @@ LINE 1: SELECT uri 'http://host:port/'; ^ \x on SELECT b AS uri, + uri_normalize(b), uri_scheme(b), uri_userinfo(b), uri_host(b), @@ -54,6 +63,7 @@ SELECT b AS uri, FROM test; -[ RECORD 1 ]--+---------------------------------------------------------------------------------------- uri | http://www.postgresql.org/ +uri_normalize | http://www.postgresql.org/ uri_scheme | http uri_userinfo | _null_ uri_host | www.postgresql.org @@ -65,6 +75,7 @@ uri_query | _null_ uri_fragment | _null_ -[ RECORD 2 ]--+---------------------------------------------------------------------------------------- uri | http://www.postgresql.org/docs/devel/static/xfunc-sql.html#XFUNC-SQL-FUNCTION-ARGUMENTS +uri_normalize | http://www.postgresql.org/docs/devel/static/xfunc-sql.html#XFUNC-SQL-FUNCTION-ARGUMENTS uri_scheme | http uri_userinfo | _null_ uri_host | www.postgresql.org @@ -76,6 +87,7 @@ uri_query | _null_ uri_fragment | XFUNC-SQL-FUNCTION-ARGUMENTS -[ RECORD 3 ]--+---------------------------------------------------------------------------------------- uri | https://duckduckgo.com/?q=postgresql&ia=about +uri_normalize | https://duckduckgo.com/?q=postgresql&ia=about uri_scheme | https uri_userinfo | _null_ 
uri_host | duckduckgo.com @@ -87,6 +99,7 @@ uri_query | q=postgresql&ia=about uri_fragment | _null_ -[ RECORD 4 ]--+---------------------------------------------------------------------------------------- uri | ftp://ftp.gnu.org/gnu/bison +uri_normalize | ftp://ftp.gnu.org/gnu/bison uri_scheme | ftp uri_userinfo | _null_ uri_host | ftp.gnu.org @@ -98,6 +111,7 @@ uri_query | _null_ uri_fragment | _null_ -[ RECORD 5 ]--+---------------------------------------------------------------------------------------- uri | mailto:foo@example.com +uri_normalize | mailto:foo@example.com uri_scheme | mailto uri_userinfo | _null_ uri_host | _null_ @@ -109,6 +123,7 @@ uri_query | _null_ uri_fragment | _null_ -[ RECORD 6 ]--+---------------------------------------------------------------------------------------- uri | ssh://username@review.openstack.org:29418/openstack/nova.git +uri_normalize | ssh://username@review.openstack.org:29418/openstack/nova.git uri_scheme | ssh uri_userinfo | username uri_host | review.openstack.org @@ -120,6 +135,7 @@ uri_query | _null_ uri_fragment | _null_ -[ RECORD 7 ]--+---------------------------------------------------------------------------------------- uri | http://admin:password@192.168.0.1 +uri_normalize | http://admin:password@192.168.0.1 uri_scheme | http uri_userinfo | admin:password uri_host | 192.168.0.1 @@ -131,6 +147,7 @@ uri_query | _null_ uri_fragment | _null_ -[ RECORD 8 ]--+---------------------------------------------------------------------------------------- uri | http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80/index.html +uri_normalize | http://[fedc:ba98:7654:3210:fedc:ba98:7654:3210]:80/index.html uri_scheme | http uri_userinfo | _null_ uri_host | FEDC:BA98:7654:3210:FEDC:BA98:7654:3210 @@ -142,6 +159,7 @@ uri_query | _null_ uri_fragment | _null_ -[ RECORD 9 ]--+---------------------------------------------------------------------------------------- uri | http://[1080::8:800:200C:417A]/foo +uri_normalize | 
http://[1080:0000:0000:0000:0008:0800:200c:417a]/foo uri_scheme | http uri_userinfo | _null_ uri_host | 1080::8:800:200C:417A @@ -153,6 +171,7 @@ uri_query | _null_ uri_fragment | _null_ -[ RECORD 10 ]-+---------------------------------------------------------------------------------------- uri | http://host: +uri_normalize | http://host: uri_scheme | http uri_userinfo | _null_ uri_host | host @@ -164,6 +183,7 @@ uri_query | _null_ uri_fragment | _null_ -[ RECORD 11 ]-+---------------------------------------------------------------------------------------- uri | +uri_normalize | uri_scheme | _null_ uri_userinfo | _null_ uri_host | _null_ @@ -175,6 +195,7 @@ uri_query | _null_ uri_fragment | _null_ -[ RECORD 12 ]-+---------------------------------------------------------------------------------------- uri | / +uri_normalize | / uri_scheme | _null_ uri_userinfo | _null_ uri_host | _null_ @@ -186,6 +207,7 @@ uri_query | _null_ uri_fragment | _null_ -[ RECORD 13 ]-+---------------------------------------------------------------------------------------- uri | foobar +uri_normalize | foobar uri_scheme | _null_ uri_userinfo | _null_ uri_host | _null_ @@ -197,6 +219,7 @@ uri_query | _null_ uri_fragment | _null_ -[ RECORD 14 ]-+---------------------------------------------------------------------------------------- uri | /foobar +uri_normalize | /foobar uri_scheme | _null_ uri_userinfo | _null_ uri_host | _null_ @@ -206,6 +229,42 @@ uri_path | /foobar uri_path_array | {foobar} uri_query | _null_ uri_fragment | _null_ +-[ RECORD 15 ]-+---------------------------------------------------------------------------------------- +uri | HTTP://www.EXAMPLE.com/ +uri_normalize | http://www.example.com/ +uri_scheme | HTTP +uri_userinfo | _null_ +uri_host | www.EXAMPLE.com +uri_host_inet | _null_ +uri_port | _null_ +uri_path | / +uri_path_array | {""} +uri_query | _null_ +uri_fragment | _null_ +-[ RECORD 16 
]-+---------------------------------------------------------------------------------------- +uri | http://www.ex%41mple.com/ +uri_normalize | http://www.example.com/ +uri_scheme | http +uri_userinfo | _null_ +uri_host | www.ex%41mple.com +uri_host_inet | _null_ +uri_port | _null_ +uri_path | / +uri_path_array | {""} +uri_query | _null_ +uri_fragment | _null_ +-[ RECORD 17 ]-+---------------------------------------------------------------------------------------- +uri | eXAMPLE://a/./b/../b/%63/%7bfoo%7d +uri_normalize | example://a/b/c/%7Bfoo%7D +uri_scheme | eXAMPLE +uri_userinfo | _null_ +uri_host | a +uri_host_inet | _null_ +uri_port | _null_ +uri_path | /./b/../b/%63/%7bfoo%7d +uri_path_array | {.,b,..,b,%63,%7bfoo%7d} +uri_query | _null_ +uri_fragment | _null_ \x off SELECT DISTINCT b FROM test ORDER BY b; @@ -214,16 +273,19 @@ SELECT DISTINCT b FROM test ORDER BY b; / /foobar + HTTP://www.EXAMPLE.com/ + eXAMPLE://a/./b/../b/%63/%7bfoo%7d foobar ftp://ftp.gnu.org/gnu/bison http://[1080::8:800:200C:417A]/foo http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80/index.html http://admin:password@192.168.0.1 http://host: + http://www.ex%41mple.com/ http://www.postgresql.org/ http://www.postgresql.org/docs/devel/static/xfunc-sql.html#XFUNC-SQL-FUNCTION-ARGUMENTS https://duckduckgo.com/?q=postgresql&ia=about mailto:foo@example.com ssh://username@review.openstack.org:29418/openstack/nova.git -(14 rows) +(17 rows) diff --git a/test/sql/test.sql b/test/sql/test.sql index 67cf1f4..abe6e23 100644 --- a/test/sql/test.sql +++ b/test/sql/test.sql @@ -20,6 +20,12 @@ VALUES ('http://www.postgresql.org/'), ('foobar'), ('/foobar'); +-- normalization test values from +INSERT INTO test (b) +VALUES ('HTTP://www.EXAMPLE.com/'), + ('http://www.ex%41mple.com/'), + ('eXAMPLE://a/./b/../b/%63/%7bfoo%7d'); + SELECT * FROM test; -- error cases @@ -28,6 +34,7 @@ SELECT uri 'http://host:port/'; \x on SELECT b AS uri, + uri_normalize(b), uri_scheme(b), uri_userinfo(b), uri_host(b), diff --git 
a/uri.c b/uri.c
index 5c7901c..a2baeaa 100644
--- a/uri.c
+++ b/uri.c
@@ -277,6 +277,55 @@ uri_path_array(PG_FUNCTION_ARGS)
 		PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
 }
 
+/*
+ * uri_normalize(uri) returns uri
+ *
+ * Parses the argument, applies uriNormalizeSyntaxA() to it and returns the
+ * result serialized back to text.
+ *
+ * The UriUriA members are released explicitly on every path, including just
+ * before each elog(ERROR): elog(ERROR) does not return, so a free placed only
+ * at the end of the function would be skipped on failure.
+ */
+PG_FUNCTION_INFO_V1(uri_normalize);
+Datum
+uri_normalize(PG_FUNCTION_ARGS)
+{
+	Datum arg = PG_GETARG_DATUM(0);
+	char *s = TextDatumGetCString(arg);
+	UriUriA uri;
+	int rc;
+	int charsRequired;
+	char *ret;
+
+	parse_uri(s, &uri);
+
+	if ((rc = uriNormalizeSyntaxA(&uri)) != URI_SUCCESS)
+	{
+		uriFreeUriMembersA(&uri);
+		elog(ERROR, "uriNormalizeSyntaxA() failed: error code %d", rc);
+	}
+
+	if ((rc = uriToStringCharsRequiredA(&uri, &charsRequired)) != URI_SUCCESS)
+	{
+		uriFreeUriMembersA(&uri);
+		elog(ERROR, "uriToStringCharsRequiredA() failed: error code %d", rc);
+	}
+	/* reserve room for the terminating NUL, which the count above excludes */
+	charsRequired++;
+
+	ret = palloc(charsRequired);
+	rc = uriToStringA(ret, &uri, charsRequired, NULL);
+
+	/* the parsed URI is no longer needed whether or not serialization worked */
+	uriFreeUriMembersA(&uri);
+
+	if (rc != URI_SUCCESS)
+		elog(ERROR, "uriToStringA() failed: error code %d", rc);
+
+	PG_RETURN_URI_P((uritype *) cstring_to_text(ret));
+}
+
 static int
 cmp_text_range(UriTextRangeA a, UriTextRangeA b)
 {
diff --git a/uri.sql b/uri.sql
index 94be6c6..edf5261 100644
--- a/uri.sql
+++ b/uri.sql
@@ -81,6 +81,13 @@ CREATE FUNCTION uri_path_array(uri) RETURNS text[]
     AS '$libdir/uri';
 
 
+CREATE FUNCTION uri_normalize(uri) RETURNS uri
+    IMMUTABLE
+    STRICT
+    LANGUAGE C
+    AS '$libdir/uri';
+
+
 CREATE FUNCTION uri_lt(uri, uri) RETURNS boolean
     IMMUTABLE
     STRICT