xref: /OpenGrok/opengrok-indexer/src/main/jflex/analysis/CommonURI.lexh (revision d219b4cea555a12b602d2d5518daa22134ad4879)
1*d219b4ceSAdam Hornacek/*
2*d219b4ceSAdam Hornacek * CDDL HEADER START
3*d219b4ceSAdam Hornacek *
4*d219b4ceSAdam Hornacek * The contents of this file are subject to the terms of the
5*d219b4ceSAdam Hornacek * Common Development and Distribution License (the "License").
6*d219b4ceSAdam Hornacek * You may not use this file except in compliance with the License.
7*d219b4ceSAdam Hornacek *
8*d219b4ceSAdam Hornacek * See LICENSE.txt included in this distribution for the specific
9*d219b4ceSAdam Hornacek * language governing permissions and limitations under the License.
10*d219b4ceSAdam Hornacek *
11*d219b4ceSAdam Hornacek * When distributing Covered Code, include this CDDL HEADER in each
12*d219b4ceSAdam Hornacek * file and include the License file at LICENSE.txt.
13*d219b4ceSAdam Hornacek * If applicable, add the following below this CDDL HEADER, with the
14*d219b4ceSAdam Hornacek * fields enclosed by brackets "[]" replaced with your own identifying
15*d219b4ceSAdam Hornacek * information: Portions Copyright [yyyy] [name of copyright owner]
16*d219b4ceSAdam Hornacek *
17*d219b4ceSAdam Hornacek * CDDL HEADER END
18*d219b4ceSAdam Hornacek */
19*d219b4ceSAdam Hornacek
20*d219b4ceSAdam Hornacek/*
21*d219b4ceSAdam Hornacek * Copyright (c) 2017, Chris Fraire <cfraire@me.com>.
22*d219b4ceSAdam Hornacek */
23*d219b4ceSAdam Hornacek
24*d219b4ceSAdam Hornacek/*
25*d219b4ceSAdam Hornacek * From RFC-3986. See
26*d219b4ceSAdam Hornacek * org.opengrok.indexer.util.StringUtils URI_CHARS_PAT where a regex
27*d219b4ceSAdam Hornacek * in accordance with the following definition is maintained.
28*d219b4ceSAdam Hornacek *
29*d219b4ceSAdam Hornacek * URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
30*d219b4ceSAdam Hornacek */
/*
 * BrowseableURI is the top-level macro of this file: one of the schemes
 * accepted by BrowseableURI_scheme followed directly by URI_tail.
 * URI_tail is everything after the scheme: a literal ":", the
 * hierarchical part, then an optional "?"-introduced query and an
 * optional "#"-introduced fragment, in that order.
 */
31*d219b4ceSAdam HornacekBrowseableURI  = {BrowseableURI_scheme} {URI_tail}
32*d219b4ceSAdam HornacekURI_tail       = ":" {URI_hier_part} ("?" {URI_query})? ("#" {URI_fragment})?
33*d219b4ceSAdam Hornacek
34*d219b4ceSAdam Hornacek/*
35*d219b4ceSAdam Hornacek * hier-part   = "//" authority path-abempty
36*d219b4ceSAdam Hornacek *                / path-absolute
37*d219b4ceSAdam Hornacek *                / path-rootless
38*d219b4ceSAdam Hornacek *                / path-empty ; N.b. not used in OpenGrok
39*d219b4ceSAdam Hornacek */
/*
 * Two alternatives are accepted:
 *   1. "//" followed by an authority and a path-abempty;
 *   2. a single "/" followed by either URI_path_absolute or
 *      URI_path_rootless.
 *
 * NOTE(review): the RFC rule quoted above lists path-absolute and
 * path-rootless with no extra leading "/". URI_path_absolute already
 * starts with "/", so as written the second alternative needs "//"
 * before an absolute path and "/" before a rootless one; a bare
 * rootless path directly after the ":" is not matched. Confirm whether
 * this narrowing is deliberate before changing the pattern.
 */
40*d219b4ceSAdam HornacekURI_hier_part  = ("//" {URI_authority} {URI_path_abempty} |
41*d219b4ceSAdam Hornacek    "/" ({URI_path_absolute} | {URI_path_rootless}))
42*d219b4ceSAdam Hornacek
43*d219b4ceSAdam Hornacek/*
44*d219b4ceSAdam Hornacek * scheme        = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
45*d219b4ceSAdam Hornacek */
/*
 * Deliberately narrower than the generic RFC scheme rule quoted above:
 * only "http", "https" (the trailing "s" is optional) and "ftp" are
 * accepted. Each letter is a two-character class, so any mix of upper
 * and lower case matches.
 */
46*d219b4ceSAdam HornacekBrowseableURI_scheme = ([Hh][Tt][Tt][Pp][Ss]? | [Ff][Tt][Pp])
47*d219b4ceSAdam Hornacek
48*d219b4ceSAdam Hornacek/*
49*d219b4ceSAdam Hornacek * authority     = [ userinfo "@" ] host [ ":" port ]
50*d219b4ceSAdam Hornacek * userinfo      = *( unreserved / pct-encoded / sub-delims / ":" )
51*d219b4ceSAdam Hornacek * host          = IP-literal / IPv4address / reg-name
52*d219b4ceSAdam Hornacek * port          = *DIGIT
53*d219b4ceSAdam Hornacek */
/*
 * URI_authority: optional userinfo terminated by "@", then a host, then
 *                an optional ":" followed by a port.
 * URI_userinfo:  zero or more unreserved, percent-encoded, sub-delims or
 *                ":" items, so it may be empty.
 * URI_host:      a bracketed IP literal, a dotted IPv4 address, or a
 *                registered name.
 * URI_port:      zero or more digits, so a ":" with no digits after it
 *                is still accepted by URI_authority.
 */
54*d219b4ceSAdam HornacekURI_authority    = ({URI_userinfo} "@")? {URI_host} (":" {URI_port})?
55*d219b4ceSAdam HornacekURI_userinfo     = ({URI_unreserved} | {URI_pct_encoded} | {URI_sub_delims} |
56*d219b4ceSAdam Hornacek    ":")*
57*d219b4ceSAdam HornacekURI_host         = ({URI_IP_literal} | {URI_IPv4address} | {URI_reg_name})
58*d219b4ceSAdam HornacekURI_port         = {DIGIT}*
59*d219b4ceSAdam Hornacek
60*d219b4ceSAdam Hornacek/*
61*d219b4ceSAdam Hornacek * IP-literal    = "[" ( IPv6address / IPvFuture  ) "]"
62*d219b4ceSAdam Hornacek */
/*
 * A literal "[" and "]" wrapped around either an IPv6 address or an
 * IPvFuture address.
 */
63*d219b4ceSAdam HornacekURI_IP_literal   = "[" ({URI_IPv6address} | {URI_IPvFuture}) "]"
64*d219b4ceSAdam Hornacek
65*d219b4ceSAdam Hornacek/*
66*d219b4ceSAdam Hornacek * IPvFuture     = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
67*d219b4ceSAdam Hornacek */
/*
 * A literal "v", one or more hex digits, a ".", then one or more
 * unreserved, sub-delims or ":" characters.
 *
 * NOTE(review): the pattern spells the version marker as lowercase "v"
 * only. ABNF quoted strings are case-insensitive, so the RFC also
 * permits "V"; whether that is matched here presumably depends on the
 * case-sensitivity setting of the including scanner -- confirm.
 */
68*d219b4ceSAdam HornacekURI_IPvFuture    = "v" {HEXDIG}+ "." ({URI_unreserved} | {URI_sub_delims} |
69*d219b4ceSAdam Hornacek    ":")+
70*d219b4ceSAdam Hornacek
71*d219b4ceSAdam Hornacek/*
72*d219b4ceSAdam Hornacek * IPv6address   =                            6( h16 ":" ) ls32
73*d219b4ceSAdam Hornacek *               /                       "::" 5( h16 ":" ) ls32
74*d219b4ceSAdam Hornacek *               / [               h16 ] "::" 4( h16 ":" ) ls32
75*d219b4ceSAdam Hornacek *               / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
76*d219b4ceSAdam Hornacek *               / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
77*d219b4ceSAdam Hornacek *               / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
78*d219b4ceSAdam Hornacek *               / [ *4( h16 ":" ) h16 ] "::"              ls32
79*d219b4ceSAdam Hornacek *               / [ *5( h16 ":" ) h16 ] "::"              h16
80*d219b4ceSAdam Hornacek *               / [ *6( h16 ":" ) h16 ] "::"
81*d219b4ceSAdam Hornacek */
/*
 * Nine alternatives, listed in the same order as the nine rows of the
 * RFC rule quoted above. Each ABNF group "[ *N( h16 ":" ) h16 ]" is
 * written as the optional group (({URI_h16} ":"){0,N} {URI_h16})?, and
 * each fixed repetition "K( h16 ":" )" as ({URI_h16} ":"){K}. The last
 * three alternatives end in an ls32, a single h16, and nothing at all
 * after the "::", respectively.
 */
82*d219b4ceSAdam HornacekURI_IPv6address = (
83*d219b4ceSAdam Hornacek    ( {URI_h16} ":" ){6} {URI_ls32} |
84*d219b4ceSAdam Hornacek                                      "::" ({URI_h16} ":"){5} {URI_ls32} |
85*d219b4ceSAdam Hornacek                         ({URI_h16})? "::" ({URI_h16} ":"){4} {URI_ls32} |
86*d219b4ceSAdam Hornacek    (({URI_h16} ":"){0,1} {URI_h16})? "::" ({URI_h16} ":"){3} {URI_ls32} |
87*d219b4ceSAdam Hornacek    (({URI_h16} ":"){0,2} {URI_h16})? "::" ({URI_h16} ":"){2} {URI_ls32} |
88*d219b4ceSAdam Hornacek    (({URI_h16} ":"){0,3} {URI_h16})? "::"  {URI_h16} ":"     {URI_ls32} |
89*d219b4ceSAdam Hornacek    (({URI_h16} ":"){0,4} {URI_h16})? "::"                    {URI_ls32} |
90*d219b4ceSAdam Hornacek    (({URI_h16} ":"){0,5} {URI_h16})? "::"  {URI_h16} |
91*d219b4ceSAdam Hornacek    (({URI_h16} ":"){0,6} {URI_h16})? "::"
92*d219b4ceSAdam Hornacek    )
93*d219b4ceSAdam Hornacek/*
94*d219b4ceSAdam Hornacek * h16           = 1*4HEXDIG
95*d219b4ceSAdam Hornacek * ls32          = ( h16 ":" h16 ) / IPv4address
96*d219b4ceSAdam Hornacek * IPv4address   = dec-octet "." dec-octet "." dec-octet "." dec-octet
97*d219b4ceSAdam Hornacek */
/*
 * URI_h16:         one to four hex digits (one 16-bit group).
 * URI_ls32:        either two h16 groups joined by ":" or a dotted IPv4
 *                  address.
 * URI_IPv4address: exactly four decimal octets separated by ".".
 */
98*d219b4ceSAdam HornacekURI_h16          = {HEXDIG}{1,4}
99*d219b4ceSAdam HornacekURI_ls32         = ({URI_h16} ":" {URI_h16} | {URI_IPv4address})
100*d219b4ceSAdam HornacekURI_IPv4address  = ({URI_dec_octet} "." {URI_dec_octet} "." {URI_dec_octet}
101*d219b4ceSAdam Hornacek    "." {URI_dec_octet})
102*d219b4ceSAdam Hornacek
103*d219b4ceSAdam Hornacek/*
104*d219b4ceSAdam Hornacek * dec-octet     = DIGIT                 ; 0-9
105*d219b4ceSAdam Hornacek *               / %x31-39 DIGIT         ; 10-99
106*d219b4ceSAdam Hornacek *               / "1" 2DIGIT            ; 100-199
107*d219b4ceSAdam Hornacek *               / "2" %x30-34 DIGIT     ; 200-249
108*d219b4ceSAdam Hornacek *               / "25" %x30-35          ; 250-255
109*d219b4ceSAdam Hornacek */
/*
 * One decimal octet, split by digit count and leading digit so that
 * three-digit values above 255 are not accepted. The \u{..} escapes
 * are hexadecimal code points taken directly from the %x ranges in the
 * RFC rule above: \u{30}-\u{39} are the characters "0" to "9".
 * The range accepted by each alternative is noted at the end of its
 * line.
 */
110*d219b4ceSAdam HornacekURI_dec_octet    = ({DIGIT}  |     // 0-9
111*d219b4ceSAdam Hornacek    [\u{31}-\u{39}] {DIGIT}  |     // 10-99
112*d219b4ceSAdam Hornacek    "1" {DIGIT}{DIGIT}  |          // 100-199
113*d219b4ceSAdam Hornacek    "2" [\u{30}-\u{34}] {DIGIT}  | // 200-249
114*d219b4ceSAdam Hornacek    "25" [\u{30}-\u{35}])          // 250-255
115*d219b4ceSAdam Hornacek
116*d219b4ceSAdam Hornacek/*
117*d219b4ceSAdam Hornacek * reg-name      = *( unreserved / pct-encoded / sub-delims )
118*d219b4ceSAdam Hornacek */
/*
 * Zero or more unreserved, percent-encoded or sub-delims items. Because
 * the repetition may be empty, this macro also matches the empty
 * string, which in turn lets URI_host match nothing.
 */
119*d219b4ceSAdam HornacekURI_reg_name     = ({URI_unreserved} | {URI_pct_encoded} | {URI_sub_delims})*
120*d219b4ceSAdam Hornacek
121*d219b4ceSAdam Hornacek/*
122*d219b4ceSAdam Hornacek * path          = path-abempty    ; begins with "/" or is empty
123*d219b4ceSAdam Hornacek *               / path-absolute   ; begins with "/" but not "//"
124*d219b4ceSAdam Hornacek *               / path-noscheme   ; begins with a non-colon segment
125*d219b4ceSAdam Hornacek *               / path-rootless   ; begins with a segment
126*d219b4ceSAdam Hornacek *               / path-empty      ; zero characters
127*d219b4ceSAdam Hornacek *
128*d219b4ceSAdam Hornacek * path-abempty  = *( "/" segment )
129*d219b4ceSAdam Hornacek * path-absolute = "/" [ segment-nz *( "/" segment ) ]
130*d219b4ceSAdam Hornacek * path-noscheme = segment-nz-nc *( "/" segment )  ; N.b. not used in OpenGrok
131*d219b4ceSAdam Hornacek * path-rootless = segment-nz *( "/" segment )
132*d219b4ceSAdam Hornacek * path-empty    = 0<pchar>  ; N.b. not used in OpenGrok
133*d219b4ceSAdam Hornacek */
/*
 * URI_path_abempty:  zero or more "/"-prefixed segments; may be empty.
 * URI_path_absolute: a "/" optionally followed by one non-empty segment
 *                    and then any number of "/"-prefixed segments.
 * URI_path_rootless: one non-empty segment followed by any number of
 *                    "/"-prefixed segments; never starts with "/".
 */
134*d219b4ceSAdam HornacekURI_path_abempty  = ("/" {URI_segment})*
135*d219b4ceSAdam HornacekURI_path_absolute = "/" ({URI_segment_nz} ("/" {URI_segment})*)?
136*d219b4ceSAdam HornacekURI_path_rootless = {URI_segment_nz} ("/" {URI_segment})*
137*d219b4ceSAdam Hornacek
138*d219b4ceSAdam Hornacek/*
139*d219b4ceSAdam Hornacek * segment       = *pchar
140*d219b4ceSAdam Hornacek * segment-nz    = 1*pchar
141*d219b4ceSAdam Hornacek * segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
142*d219b4ceSAdam Hornacek *               ; non-zero-length segment without any colon ":"
143*d219b4ceSAdam Hornacek *               ; N.b. not used in OpenGrok
144*d219b4ceSAdam Hornacek * pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
145*d219b4ceSAdam Hornacek */
/*
 * URI_segment:    zero or more path characters (may be empty).
 * URI_segment_nz: one or more path characters.
 * URI_pchar:      a single path character: unreserved, a percent-encoded
 *                 triplet, a sub-delimiter, ":" or "@".
 */
146*d219b4ceSAdam HornacekURI_segment      = {URI_pchar}*
147*d219b4ceSAdam HornacekURI_segment_nz   = {URI_pchar}+
148*d219b4ceSAdam HornacekURI_pchar        = ({URI_unreserved} | {URI_pct_encoded} | {URI_sub_delims} |
149*d219b4ceSAdam Hornacek    [:@])
150*d219b4ceSAdam Hornacek
151*d219b4ceSAdam Hornacek/*
152*d219b4ceSAdam Hornacek * query         = *( pchar / "/" / "?" )
153*d219b4ceSAdam Hornacek */
/*
 * Zero or more path characters, "/" or "?"; may be empty. The leading
 * "?" that introduces a query is not part of this macro; it is supplied
 * by URI_tail.
 */
154*d219b4ceSAdam HornacekURI_query        = ({URI_pchar} | [/\?])*
155*d219b4ceSAdam Hornacek
156*d219b4ceSAdam Hornacek/*
157*d219b4ceSAdam Hornacek * fragment      = *( pchar / "/" / "?" )
158*d219b4ceSAdam Hornacek */
/*
 * Zero or more path characters, "/" or "?"; may be empty. The leading
 * "#" that introduces a fragment is not part of this macro; it is
 * supplied by URI_tail.
 */
159*d219b4ceSAdam HornacekURI_fragment     = ({URI_pchar} | [/\?])*
160*d219b4ceSAdam Hornacek
161*d219b4ceSAdam Hornacek/*
162*d219b4ceSAdam Hornacek * pct-encoded   = "%" HEXDIG HEXDIG
163*d219b4ceSAdam Hornacek */
/*
 * A percent-encoded triplet: a literal "%" followed by exactly two hex
 * digits of either case.
 */
164*d219b4ceSAdam HornacekURI_pct_encoded  = "%" {HEXDIG} {HEXDIG}
165*d219b4ceSAdam Hornacek
166*d219b4ceSAdam Hornacek/*
167*d219b4ceSAdam Hornacek * unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
168*d219b4ceSAdam Hornacek * reserved      = gen-delims / sub-delims  ; N.b. not used in OpenGrok
169*d219b4ceSAdam Hornacek * gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
170*d219b4ceSAdam Hornacek *               ; N.b. not used in OpenGrok
171*d219b4ceSAdam Hornacek * sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
172*d219b4ceSAdam Hornacek *               / "*" / "+" / "," / ";" / "="
173*d219b4ceSAdam Hornacek */
/*
 * URI_unreserved: one ASCII letter, one digit, or one of "-", ".", "_",
 *                 "~".
 * URI_sub_delims: one of the eleven sub-delimiter characters listed in
 *                 the RFC rule above.
 * The backslashes inside the character classes only escape the
 * character that follows them; they are not matched themselves.
 */
174*d219b4ceSAdam HornacekURI_unreserved   = ({ASCII_ALPHA} | {DIGIT} | [\-\._~])
175*d219b4ceSAdam HornacekURI_sub_delims   = [\!\$&\'\(\)\*\+,;=]
176*d219b4ceSAdam Hornacek
/*
 * Leaf character classes used by the macros above.
 * ASCII_ALPHA: one letter A-Z or a-z.
 * HEXDIG:      one hex digit; both upper- and lower-case A-F are
 *              accepted.
 * DIGIT:       one decimal digit 0-9.
 */
177*d219b4ceSAdam HornacekASCII_ALPHA  = [A-Za-z]
178*d219b4ceSAdam HornacekHEXDIG = [0-9A-Fa-f]
179*d219b4ceSAdam HornacekDIGIT  = [0-9]
180