1 /*
2 * \brief Tokenizer support
3 * \author Norman Feske
4 * \date 2006-05-19
5 */
6
7 /*
8 * Copyright (C) 2006-2013 Genode Labs GmbH
9 *
10 * This file is part of the Genode OS framework, which is distributed
11 * under the terms of the GNU General Public License version 2.
12 */
13
14 #ifndef _INCLUDE__UTIL__TOKEN_H_
15 #define _INCLUDE__UTIL__TOKEN_H_
16
17 #include <util/string.h>
18
19 namespace Genode {
20
21 /**
22 * Scanner policy that accepts underline characters in identifiers
23 */
24 struct Scanner_policy_identifier_with_underline
25 {
26 /**
27 * Return true if character belongs to a valid identifier
28 *
29 * \param c character
30 * \param i index of character in token
31 * \return true if character is a valid identifier character
32 *
33 * Letters and underline characters are allowed anywhere in an
34 * identifier, digits must not appear at the beginning.
35 */
36 static bool identifier_char(char c, unsigned i) {
37 return is_letter(c) || (c == `_`) || (i && is_digit(c)); }
38 };
39
40 /**
41 * Token
42 *
* This class is used to group the characters of a string that belong
* to one syntactical token. The token types are number, identifier,
* string, whitespace, or another single character.
46 *
47 * \param SCANNER_POLICY policy that defines the way of token scanning
48 *
49 * See `Scanner_policy_identifier_with_underline` for an example scanner
50 * policy.
51 */
52 template <typename SCANNER_POLICY>
53 class Token
54 {
55 public:
56
57 enum Type { SINGLECHAR, NUMBER, IDENT, STRING, WHITESPACE, END };
58
59 /**
60 * Constructor
61 *
62 * \param s start of string to construct a token from
63 * \param max_len maximum token length
64 *
65 * The `max_len` argument is useful for processing character arrays
66 * that are not null-terminated.
67 */
68 Token(const char *s = 0, size_t max_len = ~0UL)
69 : _start(s), _max_len(max_len), _len(s ? _calc_len(max_len) : 0) { }
70
71 /**
72 * Accessors
73 */
74 char *start() const { return (char *)_start; }
75 size_t len() const { return _len; }
76 Type type() const { return _type(_len); }
77
78 /**
79 * Return token as null-terminated string
80 */
81 void string(char *dst, size_t max_len) const {
82 strncpy(dst, start(), min(len() + 1, max_len)); }
83
84 /**
85 * Return true if token is valid
86 */
87 operator bool () const { return _start && _len; }
88
89 /**
90 * Access single characters of token
91 */
92 char operator [] (int idx)
93 {
94 return ((idx >= 0) && ((unsigned)idx < _len)) ? _start[idx] : 0;
95 }
96
97 /**
98 * Return next token
99 */
100 Token next() const { return Token(_start + _len, _max_len - _len); }
101
102 /**
103 * Return next non-whitespace token
104 */
105 Token eat_whitespace() const { return (_type(_len) == WHITESPACE) ? next() : *this; }
106
107 private:
108
109 const char *_start;
110 size_t _max_len;
111 size_t _len;
112
113 /**
114 * Return type of token
115 *
116 * \param max_len maximum token length
117 *
118 * This function is used during the construction of `Token`
119 * objects, in particular for determining the value of the `_len`
120 * member. Therefore, we explicitely pass the `max_len` to the
121 * function. For the public interface, there exists the `type()`
122 * accessor, which relies on `_len` as implicit argument.
123 */
124 Type _type(size_t max_len) const
125 {
126 if (!_start || max_len < 1 || !*_start) return END;
127
128 /* determine the type based on the first character */
129 char c = *_start;
130 if (SCANNER_POLICY::identifier_char(c, 0)) return IDENT;
131 if (is_digit(c)) return NUMBER;
132 if (is_whitespace(c)) return WHITESPACE;
133
134 /* if string is incomplete, discard it (type END) */
135 if (c == `"`)
136 return _quoted_string_len(max_len) ? STRING : END;
137
138 return SINGLECHAR;
139 }
140
141 size_t _quoted_string_len(size_t max_len) const
142 {
143 unsigned i = 0;
144
145 for (; !end_of_quote(&_start[i]) && i < max_len; i++)
146
147 /* string ends without final quotation mark? too bad! */
148 if (!_start[i]) return 0;
149
150 /* exceeded maximum token length */
151 if (i == max_len) return 0;
152
153 /*
154 * We stopped our search at the character before the
155 * final quotation mark but we return the number of
156 * characters including the quotation marks.
157 */
158 return i + 2;
159 }
160
161 /**
162 * Return length of token
163 */
164 int _calc_len(size_t max_len) const
165 {
166 switch (_type(max_len)) {
167
168 case SINGLECHAR:
169 return 1;
170
171 case NUMBER:
172 {
173 unsigned i = 0;
174 for (; i < max_len && is_digit(_start[i]); i++);
175 return i;
176 }
177
178 case IDENT:
179 {
180 unsigned i = 0;
181 for (; i < max_len; i++) {
182 if (SCANNER_POLICY::identifier_char(_start[i], i))
183 continue;
184
185 /* stop if any other (invalid) character occurs */
186 break;
187 }
188 return i;
189 }
190
191 case STRING:
192
193 return _quoted_string_len(max_len);
194
195 case WHITESPACE:
196 {
197 unsigned i = 0;
198 for (; is_whitespace(_start[i]) && i < max_len; i++);
199 return i;
200 }
201
202 case END:
203 default:
204 return 0;
205 }
206 }
207 };
208 }
209
210 #endif /* _INCLUDE__UTIL__TOKEN_H_ */