/*
 * \brief  Tokenizer support
 * \author Norman Feske
 * \date   2006-05-19
 */

/*
 * Copyright (C) 2006-2013 Genode Labs GmbH
 *
 * This file is part of the Genode OS framework, which is distributed
 * under the terms of the GNU General Public License version 2.
 */

  13
  14  #ifndef _INCLUDE__UTIL__TOKEN_H_
  15  #define _INCLUDE__UTIL__TOKEN_H_
  16
  17  #include <util/string.h>
  18
  19

namespace Genode {

  20  

  21     /**

  22      * Scanner policy that accepts underline characters in identifiers

  23      */

  24     struct Scanner_policy_identifier_with_underline

  25     {

  26        /**

  27         * Return true if character belongs to a valid identifier

  28         *

  29         * \param c  character

  30         * \param i  index of character in token

  31         * \return   true if character is a valid identifier character

  32         *

  33         * Letters and underline characters are allowed anywhere in an

  34         * identifier, digits must not appear at the beginning.

  35         */

  36        static bool identifier_char(char c, unsigned i) {

  37           return is_letter(c) || (c == `_`) || (i && is_digit(c)); }

  38     };

  39  

  40     /**

  41      * Token

  42      *

  43      * This class is used to group characters of a string which belong

  44      * to one syntactical token types number, identifier, string,

  45      * whitespace or another single character.

  46      *

  47      * \param SCANNER_POLICY  policy that defines the way of token scanning

  48      *

  49      * See `Scanner_policy_identifier_with_underline` for an example scanner

  50      * policy.

  51      */

  52     template <typename SCANNER_POLICY>

  53     class Token

  54     {

  55        public:

  56  

  57           enum Type { SINGLECHAR, NUMBER, IDENT, STRING, WHITESPACE, END };

  58  

  59           /**

  60            * Constructor

  61            *

  62            * \param s        start of string to construct a token from

  63            * \param max_len  maximum token length

  64            *

  65            * The `max_len` argument is useful for processing character arrays

  66            * that are not null-terminated.

  67            */

  68           Token(const char *s = 0, size_t max_len = ~0UL)

  69           : _start(s), _max_len(max_len), _len(s ? _calc_len(max_len) : 0) { }

  70  

  71           /**

  72            * Accessors

  73            */

  74           char *start() const { return (char *)_start; }

  75           size_t  len() const { return _len; }

  76           Type   type() const { return _type(_len); }

  77  

  78           /**

  79            * Return token as null-terminated string

  80            */

  81           void string(char *dst, size_t max_len) const {

  82              strncpy(dst, start(), min(len() + 1, max_len)); }

  83  

  84           /**

  85            * Return true if token is valid

  86            */

  87           operator bool () const { return _start && _len; }

  88  

  89           /**

  90            * Access single characters of token

  91            */

  92           char operator [] (int idx)

  93           {

  94              return ((idx >= 0) && ((unsigned)idx < _len)) ? _start[idx] : 0;

  95           }

  96  

  97           /**

  98            * Return next token

  99            */

 100           Token next() const { return Token(_start + _len, _max_len - _len); }

 101  

 102           /**

 103            * Return next non-whitespace token

 104            */

 105           Token eat_whitespace() const { return (_type(_len) == WHITESPACE) ? next() : *this; }

 106  

 107        private:

 108  

 109           const char *_start;

 110           size_t      _max_len;

 111           size_t      _len;

 112  

 113           /**

 114            * Return type of token

 115            *

 116            * \param  max_len  maximum token length

 117            *

 118            * This function is used during the construction of `Token`

 119            * objects, in particular for determining the value of the `_len`

 120            * member. Therefore, we explicitely pass the `max_len` to the

 121            * function. For the public interface, there exists the `type()`

 122            * accessor, which relies on `_len` as implicit argument.

 123            */

 124           Type _type(size_t max_len) const

 125           {

 126              if (!_start || max_len < 1 || !*_start) return END;

 127  

 128              /* determine the type based on the first character */

 129              char c = *_start;

 130              if (SCANNER_POLICY::identifier_char(c, 0)) return IDENT;

 131              if (is_digit(c))                           return NUMBER;

 132              if (is_whitespace(c))                      return WHITESPACE;

 133  

 134              /* if string is incomplete, discard it (type END) */

 135              if (c == `"`)

 136                 return _quoted_string_len(max_len) ? STRING : END;

 137  

 138              return SINGLECHAR;

 139           }

 140  

 141           size_t _quoted_string_len(size_t max_len) const

 142           {

 143              unsigned i = 0;

 144  

 145              for (; !end_of_quote(&_start[i]) && i < max_len; i++)

 146  

 147                 /* string ends without final quotation mark? too bad! */

 148                 if (!_start[i]) return 0;

 149  

 150              /* exceeded maximum token length */

 151              if (i == max_len) return 0;

 152  

 153              /*

 154               * We stopped our search at the character before the

 155               * final quotation mark but we return the number of

 156               * characters including the quotation marks.

 157               */

 158              return i + 2;

 159           }

 160  

 161           /**

 162            * Return length of token

 163            */

 164           int _calc_len(size_t max_len) const

 165           {

 166              switch (_type(max_len)) {

 167  

 168              case SINGLECHAR:

 169                 return 1;

 170  

 171              case NUMBER:

 172                 {

 173                    unsigned i = 0;

 174                    for (; i < max_len && is_digit(_start[i]); i++);

 175                    return i;

 176                 }

 177  

 178              case IDENT:

 179                 {

 180                    unsigned i = 0;

 181                    for (; i < max_len; i++) {

 182                       if (SCANNER_POLICY::identifier_char(_start[i], i))

 183                          continue;

 184  

 185                       /* stop if any other (invalid) character occurs */

 186                       break;

 187                    }

 188                    return i;

 189                 }

 190  

 191              case STRING:

 192  

 193                 return _quoted_string_len(max_len);

 194  

 195              case WHITESPACE:

 196                 {

 197                    unsigned i = 0;

 198                    for (; is_whitespace(_start[i]) && i < max_len; i++);

 199                    return i;

 200                 }

 201  

 202              case END:

 203              default:

 204                 return 0;

 205              }

 206           }

 207     };

 208  }

209
210 #endif /* _INCLUDE__UTIL__TOKEN_H_ */