and rewrote it for Tcl. Holy RE, batman! It isn't perfect, doesn't match the latest URI spec [1], and needs a good test suite for verification, but watch this page!Just added IPv6 support. ${hsegment} has a problem picking-up closing parens, which is a legal character, but in the following context it is not: "See my website (http://www.example.com/
) for the answer to life, the universe and everything"...Stop the press... A sharp knife clued me in to the joke.. The proc is really a joke. And the joke was on me. Here's a dumb RE, just as I needed it:
(?:[[:alpha:]]?)(?:\w){2,7}:(?://?)(?:[^[:space:]>"]*)I'll leave the rest in case someone is building an over the top URI parser and wants to turn off all the non-reporting atoms :) Maybe some of the bits are useful. Have a nice day!AK: Tcllib's uri module has a similar set of regular expressions as well. Might benefit from the RE's given here too. proc makeURI_RE {} {
set nz_digit {[[:digit:]]}
set nz_digits "(?:$nz_digit+)"
set digits {(?:\d+)}
set space {(?:%20)}
set nl {(?:%0[Aa])}
set dot {\.}
set plus {\+}
set qm {\?}
set ast {\*}
set hex {[a-fA-F\d]}
set alpha {[[:alpha:]]}
set alphas "(?:${alpha}+)"
set alphanum {[[:alnum:]]}
set xalphanum "(?:${alphanum}|%(?:3\\d|\[46\]$hex|\[57\]\[Aa\\d\]))"
set alphanums "(?:${alphanum}+)"
set escape "(?:%$hex\{2\})"
set safe {[$\-_.+]}
set extra {[~!*'(),]}
set unwise {[{}|\\^[\]`]}
set punctuation {[[:punct:]]}
set reserved {[;/?:@&=]}
set uchar "(?:${alphanum}|${safe}|${extra}|${escape})"
set xchar "(?:${alphanum}|${safe}|${extra}|${reserved}|${escape})"
# URL schemeparts for ip based protocols:
set user "(?:(?:${uchar}|\[\;?&=\])*)"
set password "(?:(?:${uchar}|\[\;?&=\])*)"
set hostnumber "(?:${digits}(?:${dot}${digits}){3})"
set toplabel "(?:${alpha}(?:(?:${alphanum}|-)*${alphanum})?)"
set domainlabel "(?:${alphanum}(?:(?:${alphanum}|-)*${alphanum})?)"
# only dotted-4 allowed (maybe I'm being strict)
set IPv4address {(?:(?:25[0-5]|2[0-4]\d|[0-1]?\d?\d)(?:\.(25[0-5]|2[0-4]\d|[0-1]?\d?\d)){3})}
# IPv6, all forms.
set IPv6full {(?:(?:[[:xdigit:]]{1,4}:){7}[[:xdigit:]]{1,4})}
set IPv6hexcomp {(?:(?:(?:[[:xdigit:]]{1,4}(?::[[:xdigit:]]{1,4})*)?)::((?:[[:xdigit:]]{1,4}(?::[[:xdigit:]]{1,4})*)?))}
set IPv6hex4dec {(?:(?:(?:[[:xdigit:]]{1,4}:){6,6})(25[0-5]|2[0-4]\d|[0-1]?\d?\d)(\.(25[0-5]|2[0-4]\d|[0-1]?\d?\d)){3})}
set IPv6hex4deccomp {(?:(?:(?:[[:xdigit:]]{1,4}(?::[[:xdigit:]]{1,4})*)?)::(?:(?:[[:xdigit:]]{1,4}:)*)(25[0-5]|2[0-4]\d|[0-1]?\d?\d)(\.(25[0-5]|2[0-4]\d|[0-1]?\d?\d)){3})}
set IPv6address "(?:${IPv6full}|${IPv6hexcomp}|${IPv6hex4dec}|${IPv6hex4deccomp})"
set hostname "(?:(?:${domainlabel}${dot})*${toplabel})"
set host "(?:${hostname}|${IPv4address}|(?:\\\[${IPv6address}\\\]))"
set hostport "(?:${host}(?::${digits})?)"
set login "(?:(?:${user}(?::${password})?@)?${hostport})"
# The predefined schemes:
# FTP (see also RFC959)
set fsegment "(?:(?:${uchar}|\[?:@&=\])*)"
set fpath "(?:${fsegment}(?:/${fsegment})*)"
set ftpurl "(?:ftp://${login}(?:/${fpath}(?:;type=\[AIDaid\])?)?)"
# FILE
set fileurl "(?:file://(?:${host}|localhost)?/${fpath})"
# HTTP http://www.ietf.org/rfc/rfc2616.txt
set hsegment "(?:(?:${uchar}|\[\;:@&=\])*)"
set search "(?:(?:${uchar}|\[\;:@&=\])*)"
set hpath "(?:${hsegment}(?:/${hsegment})*)"
set httpurl "(?:http(?:s?)://${hostport}(?:/${hpath}(?:${qm}${search})?)?)"
# GOPHER (see also RFC1436)
set gopher_plus "(?:${xchar}*)"
set selector "(?:${xchar}*)"
set gtype ${xchar}
set gopherurl [join [list "(?:gopher://${hostport}(?:/${gtype}(?:${selector}(?:%09${search}" \
"(?:%09${gopher_plus})?)?)?)?)" ] ""]
# MAILTO (see also RFC822)
set encoded822addr "(?:$xchar+)"
set mailtourl "(?:mailto:$encoded822addr)"
# NEWS (see also RFC1036)
set article "(?:(?:${uchar}|\[\;/?:&=\])+@${host})"
set group "(?:${alpha}(?:${alphanum}|\[_.+-\])*)"
set grouppart "(?:${article}|${group}|${ast})"
set newsurl "(?:news:${grouppart})"
# NNTP (see also RFC977)
set nntpurl "(?:nntp://${hostport}/${group}(?:/${digits})?)"
# TELNET
set telneturl "(?:telnet://${login}/?)"
# WAIS (see also RFC1625)
set wpath "(?:${uchar}*)"
set wtype "(?:${uchar}*)"
set database "(?:${uchar}*)"
set waisdoc "(?:wais://${hostport}/${database}/${wtype}/${wpath})"
set waisindex "(?:wais://${hostport}/${database}${qm}${search})"
set waisdatabase "(?:wais://${hostport}/${database})"
# my $waisurl = "(?:${waisdatabase}|${waisindex}|${waisdoc})";
# Speed up: the 3 types share a common prefix.
set waisurl "(?:wais://${hostport}/${database}(?:(?:/${wtype}/${wpath})|${qm}${search})?)"
# PROSPERO
set fieldvalue "(?:(?:${uchar}|\[?:@&\])*)"
set fieldname "(?:(?:${uchar}|\[?:@&\])*)"
set fieldspec "(?:;${fieldname}=${fieldvalue})"
set psegment "(?:(?:${uchar}|\[?:@&=\])*)"
set ppath "(?:${psegment}(?:/${psegment})*)"
set prosperourl "(?:prospero://${hostport}/${ppath}(?:${fieldspec})*)"
# LDAP (see also RFC1959)
# First. import stuff from RFC 1779 (Distinguished Names).
# We've modified things a bit.
set dn_separator "(?:\[\;,\])"
set dn_optional_space "(?:${nl}?${space}*)"
set dn_spaced_separator "(?:${dn_optional_space}${dn_separator}${dn_optional_space})"
set dn_oid "(?:${digits}(?:${dot}${digits})*)"
set dn_keychar "(?:${xalphanum}|${space})"
set dn_key "(?:${dn_keychar}+|(?:OID|oid)${dot}${dn_oid})"
set dn_string "(?:${uchar}*)"
set dn_attribute "(?:(?:${dn_key}${dn_optional_space}=${dn_optional_space})?${dn_string})"
set dn_name_component "(?:${dn_attribute}(?:${dn_optional_space}${plus}${dn_optional_space}${dn_attribute})*)"
set dn_name [join [list "(?:${dn_name_component}(?:${dn_spaced_separator}${dn_name_component})*" \
"${dn_spaced_separator}?)" ] ""]
# RFC 1558 defines the filter syntax, but that requires a PDA to recognize.
# Since that's too powerful for Perl's REs, we allow any char between the
# parenthesis (which have to be there.)
set ldap_filter "(?:\(${xchar}+\))"
# This is from RFC 1777. It defines an attributetype as an 'OCTET STRING',
# whatever that is.
set ldap_attr_type "(?:${uchar}+)"
# Now we are at the grammar of RFC 1959.
set ldap_attr_list "(?:${ldap_attr_type}(?:,${ldap_attr_type})*)"
set ldap_attrs "(?:${ldap_attr_list}?)"
set ldap_scope "(?:base|one|sub)"
set ldapurl [join [list "(?:ldap://(?:${hostport})?/${dn_name}(?:${qm}${ldap_attrs}" \
"(?:${qm}${ldap_scope}(?:${qm}${ldap_filter})?)?)?)" ] ""]
# RFC 2056 defines the format of URLs for the Z39.50 protocol.
set z_database "(?:${uchar}+)"
set z_docid "(?:${uchar}+)"
set z_elementset "(?:${uchar}+)"
set z_recordsyntax "(?:${uchar}+)"
set z_scheme "(?:z39${dot}50\[rs\])"
set z39_50url [join [list "(?:${z_scheme}://${hostport}(?:/(?:${z_database}(?:${plus}" \
"${z_database})*(?:${qm}${z_docid})?)?(?:\;esn=${z_elementset})?" \
"(?:\;rs=${z_recordsyntax}(?:${plus}${z_recordsyntax})*)?))" ] ""]
# RFC 2111 defines the format for cid/mid URLs.
set url_addr_spec "(?:(?:${uchar}|\[;?:@&=\])*)"
set message_id $url_addr_spec
set content_id $url_addr_spec
set cidurl "(?:cid:${content_id})"
set midurl "(?:mid:${message_id}(?:/${content_id})?)"
# RFC 2122 defines the Vemmi URLs.
set vemmi_attr "(?:(?:${uchar}|\[/?:@&\])*)"
set vemmi_value "(?:(?:${uchar}|\[/?:@&\])*)"
set vemmi_service "(?:(?:${uchar}|\[/?:@&=\])*)"
set vemmi_param "(?:\;${vemmi_attr}=${vemmi_value})"
set vemmiurl "(?:vemmi://${hostport}(?:/${vemmi_service}(?:${vemmi_param}*))?)"
# RFC 2192 for IMAP URLs.
# Import from RFC 2060.
# set imap4_astring ""
# set imap4_search_key ""
# set imap4_section_text ""
set imap4_nz_number $nz_digits;
set achar "(?:${uchar}|\[&=~\])"
set bchar "(?:${uchar}|\[&=~:@/\])"
set enc_auth_type "(?:${achar}+)"
set enc_list_mbox "(?:${bchar}+)"
set enc_mailbox "(?:${bchar}+)"
set enc_search "(?:${bchar}+)"
set enc_section "(?:${bchar}+)"
set enc_user "(?:${achar}+)"
set i_auth "(?:\;\[Aa\]\[Uu\]\[Tt\]\[Hh\]=(?:${ast}|${enc_auth_type}))";
set i_list_type "(?:\[Ll\](?:\[Ii\]\[Ss\]\[Tt\]|\[Ss\]\[Uu\]\[Bb\]))";
set i_mailboxlist "(?:${enc_list_mbox}?\;\[Tt\]\[Yy\]\[Pp\]\[Ee\]=${i_list_type})";
set i_uidvalidity [join [list "(?:\;\[Uu\]\[Ii\]\[Dd\]\[Vv\]\[Aa\]\[Ll\]\[Ii\]\[Dd\]\[Ii\]" \
"\[Tt\]\[Yy\]=${imap4_nz_number})" ] ""]
set i_messagelist "(?:${enc_mailbox}(?:${qm}${enc_search})?(?:${i_uidvalidity})?)"
set i_section "(?:/\;\[Ss\]\[Ee\]\[Cc\]\[Tt\]\[Ii\]\[Oo\]\[Nn\]=${enc_section})"
set i_uid "(?:/\;\[Uu\]\[Ii\]\[Dd\]=${imap4_nz_number})"
set i_messagepart "(?:${enc_mailbox}(?:${i_uidvalidity})?${i_uid}(?:${i_section})?)"
set i_command "(?:${i_mailboxlist}|${i_messagelist}|${i_messagepart})"
set i_userauth "(?:(?:${enc_user}(?:${i_auth})?)|(?:${i_auth}(?:${enc_user})?))"
set i_server "(?:(?:${i_userauth}@)?${hostport})"
set imapurl "(?:imap://${i_server}/(?:$i_command)?)"
# RFC 2224 for NFS.
set nfs_mark {[$-_.!~*'(),]}
set nfs_unreserved "(?:${alphanum}|${nfs_mark})"
set nfs_pchar "(?:${nfs_unreserved}|${escape}|\[:@&=+\])"
set nfs_segment "(?:${nfs_pchar}*)"
set nfs_path_segs "(?:${nfs_segment}(?:/${nfs_segment})*)"
set nfs_url_path "(?:/?${nfs_path_segs})"
set nfs_rel_path "(?:${nfs_path_segs}?)"
set nfs_abs_path "(?:/${nfs_rel_path})"
set nfs_net_path "(?://${hostport}(?:${nfs_abs_path})?)"
set nfs_rel_url "(?:${nfs_net_path}|${nfs_abs_path}|${nfs_rel_path})"
set nfsurl "(?:nfs:${nfs_rel_url})"
# Combining all the different URL formats into a single regex.
return [join [list $httpurl $ftpurl $newsurl $nntpurl \
$telneturl $gopherurl $waisurl $mailtourl \
$fileurl $prosperourl $ldapurl $z39_50url \
$cidurl $midurl $vemmiurl $imapurl $nfsurl] |]
}
