xref: /OpenGrok/opengrok-indexer/src/main/jflex/analysis/CommonURI.lexh (revision d219b4cea555a12b602d2d5518daa22134ad4879)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * See LICENSE.txt included in this distribution for the specific
9 * language governing permissions and limitations under the License.
10 *
11 * When distributing Covered Code, include this CDDL HEADER in each
12 * file and include the License file at LICENSE.txt.
13 * If applicable, add the following below this CDDL HEADER, with the
14 * fields enclosed by brackets "[]" replaced with your own identifying
15 * information: Portions Copyright [yyyy] [name of copyright owner]
16 *
17 * CDDL HEADER END
18 */
19
20/*
21 * Copyright (c) 2017, Chris Fraire <cfraire@me.com>.
22 */
23
24/*
25 * From RFC-3986. See
26 * org.opengrok.indexer.util.StringUtils URI_CHARS_PAT where a regex
27 * in accordance with the following definition is maintained.
28 *
29 * URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
30 */
31BrowseableURI  = {BrowseableURI_scheme} {URI_tail}  // restricted scheme followed by the generic remainder
32URI_tail       = ":" {URI_hier_part} ("?" {URI_query})? ("#" {URI_fragment})?  // query and fragment are each optional
33
34/*
35 * hier-part   = "//" authority path-abempty
36 *                / path-absolute
37 *                / path-rootless
38 *                / path-empty ; N.b. not used in OpenGrok
39 */
40URI_hier_part  = ("//" {URI_authority} {URI_path_abempty} |  // authority form
41    {URI_path_absolute} | {URI_path_rootless})  // no extra "/" here: path-absolute supplies its own
42
43/*
44 * scheme        = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
45 */
46BrowseableURI_scheme = ([Hh][Tt][Tt][Pp][Ss]? | [Ff][Tt][Pp])  // only http, https or ftp, in any letter case
47
48/*
49 * authority     = [ userinfo "@" ] host [ ":" port ]
50 * userinfo      = *( unreserved / pct-encoded / sub-delims / ":" )
51 * host          = IP-literal / IPv4address / reg-name
52 * port          = *DIGIT
53 */
54URI_authority    = ({URI_userinfo} "@")? {URI_host} (":" {URI_port})?  // userinfo and port are optional
55URI_userinfo     = ({URI_unreserved} | {URI_pct_encoded} | {URI_sub_delims} |
56    ":")*  // may be empty
57URI_host         = ({URI_IP_literal} | {URI_IPv4address} | {URI_reg_name})  // URI_reg_name can be empty, so host can be too
58URI_port         = {DIGIT}*  // may be empty, so a ":" with no digits is accepted
59
60/*
61 * IP-literal    = "[" ( IPv6address / IPvFuture  ) "]"
62 */
63URI_IP_literal   = "[" ({URI_IPv6address} | {URI_IPvFuture}) "]"  // square-bracketed IPv6 or IPvFuture address
64
65/*
66 * IPvFuture     = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
67 */
68URI_IPvFuture    = "v" {HEXDIG}+ "." ({URI_unreserved} | {URI_sub_delims} |  // "v", 1+ hex digits, "."
69    ":")+  // then at least one of the listed characters
70
71/*
72 * IPv6address   =                            6( h16 ":" ) ls32
73 *               /                       "::" 5( h16 ":" ) ls32
74 *               / [               h16 ] "::" 4( h16 ":" ) ls32
75 *               / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
76 *               / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
77 *               / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
78 *               / [ *4( h16 ":" ) h16 ] "::"              ls32
79 *               / [ *5( h16 ":" ) h16 ] "::"              h16
80 *               / [ *6( h16 ":" ) h16 ] "::"
81 */
82URI_IPv6address = (
83    ( {URI_h16} ":" ){6} {URI_ls32} |  // no "::": six h16 ":" groups, then ls32
84                                      "::" ({URI_h16} ":"){5} {URI_ls32} |  // leading "::", five groups, ls32
85                         ({URI_h16})? "::" ({URI_h16} ":"){4} {URI_ls32} |  // up to 1 h16 before "::", four groups + ls32 after
86    (({URI_h16} ":"){0,1} {URI_h16})? "::" ({URI_h16} ":"){3} {URI_ls32} |  // up to 2 before, three groups + ls32 after
87    (({URI_h16} ":"){0,2} {URI_h16})? "::" ({URI_h16} ":"){2} {URI_ls32} |  // up to 3 before, two groups + ls32 after
88    (({URI_h16} ":"){0,3} {URI_h16})? "::"  {URI_h16} ":"     {URI_ls32} |  // up to 4 before, one group + ls32 after
89    (({URI_h16} ":"){0,4} {URI_h16})? "::"                    {URI_ls32} |  // up to 5 before, ls32 only after
90    (({URI_h16} ":"){0,5} {URI_h16})? "::"  {URI_h16} |  // up to 6 before, a single h16 after
91    (({URI_h16} ":"){0,6} {URI_h16})? "::"  // up to 7 before, nothing after
92    )
93/*
94 * h16           = 1*4HEXDIG
95 * ls32          = ( h16 ":" h16 ) / IPv4address
96 * IPv4address   = dec-octet "." dec-octet "." dec-octet "." dec-octet
97 */
98URI_h16          = {HEXDIG}{1,4}  // one to four hex digits
99URI_ls32         = ({URI_h16} ":" {URI_h16} | {URI_IPv4address})  // two h16 groups, or a dotted IPv4 address
100URI_IPv4address  = ({URI_dec_octet} "." {URI_dec_octet} "." {URI_dec_octet}
101    "." {URI_dec_octet})  // four dec-octets separated by "."
102
103/*
104 * dec-octet     = DIGIT                 ; 0-9
105 *               / %x31-39 DIGIT         ; 10-99
106 *               / "1" 2DIGIT            ; 100-199
107 *               / "2" %x30-34 DIGIT     ; 200-249
108 *               / "25" %x30-35          ; 250-255
109 */
110URI_dec_octet    = ({DIGIT}  |   // one digit: 0-9
111    [1-9] {DIGIT}  |             // two digits, no leading zero: 10-99
112    "1" {DIGIT}{2}  |            // 100-199
113    "2" [0-4] {DIGIT}  |         // 200-249
114    "25" [0-5])                  // 250-255
115
116/*
117 * reg-name      = *( unreserved / pct-encoded / sub-delims )
118 */
119URI_reg_name     = ({URI_unreserved} | {URI_pct_encoded} | {URI_sub_delims})*  // zero or more; may be empty
120
121/*
122 * path          = path-abempty    ; begins with "/" or is empty
123 *               / path-absolute   ; begins with "/" but not "//"
124 *               / path-noscheme   ; begins with a non-colon segment
125 *               / path-rootless   ; begins with a segment
126 *               / path-empty      ; zero characters
127 *
128 * path-abempty  = *( "/" segment )
129 * path-absolute = "/" [ segment-nz *( "/" segment ) ]
130 * path-noscheme = segment-nz-nc *( "/" segment )  ; N.b. not used in OpenGrok
131 * path-rootless = segment-nz *( "/" segment )
132 * path-empty    = 0<pchar>  ; N.b. not used in OpenGrok
133 */
134URI_path_abempty  = ("/" {URI_segment})*  // empty, or a run of "/"-prefixed segments
135URI_path_absolute = "/" ({URI_segment_nz} ("/" {URI_segment})*)?  // a lone "/" is allowed
136URI_path_rootless = {URI_segment_nz} ("/" {URI_segment})*  // must start with a non-empty segment
137
138/*
139 * segment       = *pchar
140 * segment-nz    = 1*pchar
141 * segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
142 *               ; non-zero-length segment without any colon ":"
143 *               ; N.b. not used in OpenGrok
144 * pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
145 */
146URI_segment      = {URI_pchar}*  // may be empty
147URI_segment_nz   = {URI_pchar}+  // at least one character
148URI_pchar        = ({URI_unreserved} | {URI_pct_encoded} | {URI_sub_delims} |
149    [:@])  // one path character; "/" is not among them
150
151/*
152 * query         = *( pchar / "/" / "?" )
153 */
154URI_query        = ({URI_pchar} | [/\?])*  // path characters plus "/" and "?"; may be empty
155
156/*
157 * fragment      = *( pchar / "/" / "?" )
158 */
159URI_fragment     = ({URI_pchar} | [/\?])*  // same pattern as URI_query; may be empty
160
161/*
162 * pct-encoded   = "%" HEXDIG HEXDIG
163 */
164URI_pct_encoded  = "%" {HEXDIG} {HEXDIG}  // "%" followed by exactly two hex digits
165
166/*
167 * unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
168 * reserved      = gen-delims / sub-delims  ; N.b. not used in OpenGrok
169 * gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
170 *               ; N.b. not used in OpenGrok
171 * sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
172 *               / "*" / "+" / "," / ";" / "="
173 */
174URI_unreserved   = ({ASCII_ALPHA} | {DIGIT} | [\-\._~])  // ASCII letter, digit, or one of - . _ ~
175URI_sub_delims   = [\!\$&\'\(\)\*\+,;=]  // one of ! $ & ' ( ) * + , ; =
176
177ASCII_ALPHA  = [a-zA-Z]     // ASCII letters only
178HEXDIG = [0-9a-fA-F]        // hex digit, either letter case
179DIGIT  = [0-9]              // ASCII decimal digit
180