// Copyright (C) 2005  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_TOKENIZER_KERNEL_1_CPp_
#define DLIB_TOKENIZER_KERNEL_1_CPp_
#include "tokenizer_kernel_1.h"

#include <iostream>
#include <cstdio>
#include <climits>      // for UCHAR_MAX

namespace dlib
{

// ----------------------------------------------------------------------------------------

    tokenizer_kernel_1::
    tokenizer_kernel_1 (
    ) :
        headset(0),
        bodyset(0),
        have_peeked(false)
    {
        try
        {
            // one flag for every possible unsigned char value.  Indices run
            // 0..UCHAR_MAX, so the arrays need UCHAR_MAX+1 entries.
            headset = new bool[UCHAR_MAX+1];
            bodyset = new bool[UCHAR_MAX+1];

            clear();
        }
        catch (...)
        {
            if (headset) delete [] headset;
            if (bodyset) delete [] bodyset;
            throw;
        }
    }

// ----------------------------------------------------------------------------------------

    tokenizer_kernel_1::
    ~tokenizer_kernel_1 (
    )
    {
        delete [] bodyset;
        delete [] headset;
    }

// ----------------------------------------------------------------------------------------

    void tokenizer_kernel_1::
    clear(
    )
    {
        using namespace std;

        in = 0;
        streambuf = 0;
        have_peeked = false;

        head = "_" + lowercase_letters() + uppercase_letters();
        body = "_" + lowercase_letters() + uppercase_letters() + numbers();

        for (unsigned long i = 0; i <= UCHAR_MAX; ++i)
        {
            headset[i] = false;
            bodyset[i] = false;
        }

        for (string::size_type i = 0; i < head.size(); ++i)
            headset[static_cast<unsigned char>(head[i])] = true;
        for (string::size_type i = 0; i < body.size(); ++i)
            bodyset[static_cast<unsigned char>(body[i])] = true;
    }

// ----------------------------------------------------------------------------------------

    void tokenizer_kernel_1::
    set_stream (
        std::istream& in_
    )
    {
        in = &in_;
        streambuf = in_.rdbuf();
        have_peeked = false;
    }

// ----------------------------------------------------------------------------------------

    bool tokenizer_kernel_1::
    stream_is_set (
    ) const
    {
        return (in != 0);
    }

// ----------------------------------------------------------------------------------------

    std::istream& tokenizer_kernel_1::
    get_stream (
    ) const
    {
        return *in;
    }

// ----------------------------------------------------------------------------------------

    void tokenizer_kernel_1::
    get_token (
        int& type,
        std::string& token
    )
    {
        if (!have_peeked)
        {
            std::streambuf::int_type ch;
            ch = streambuf->sbumpc();
            switch (ch)
            {
                case EOF:
                    type = END_OF_FILE;
                    token.clear();
                    return;

                case '\n':
                    type = END_OF_LINE;
                    token = "\n";
                    return;

                case '\r':
                case ' ':
                case '\t':
                    type = WHITE_SPACE;
                    token = static_cast<char>(ch);
                    ch = streambuf->sgetc();
                    while ((ch == ' ' || ch == '\t' || ch == '\r') && ch != EOF)
                    {
                        token += static_cast<char>(ch);
                        ch = streambuf->snextc();
                    }
                    return;

                default:
                    if (headset[static_cast<unsigned char>(ch)])
                    {
                        type = IDENTIFIER;
                        token = static_cast<char>(ch);
                        ch = streambuf->sgetc();
                        // test for EOF before using ch as an array index
                        while (ch != EOF && bodyset[static_cast<unsigned char>(ch)])
                        {
                            token += static_cast<char>(ch);
                            ch = streambuf->snextc();
                        }
                    }
                    else if ('0' <= ch && ch <= '9')
                    {
                        type = NUMBER;
                        token = static_cast<char>(ch);
                        ch = streambuf->sgetc();
                        while (('0' <= ch && ch <= '9') && ch != EOF)
                        {
                            token += static_cast<char>(ch);
                            ch = streambuf->snextc();
                        }
                    }
                    else
                    {
                        type = CHAR;
                        token = static_cast<char>(ch);
                    }
                    return;
            } // switch (ch)
        }

        // if we get this far it means we have peeked so we should
        // return the peek data.
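        // (peek_type() and peek_token() stash the upcoming token in next_type
        // and next_token and set have_peeked; here we just hand that stored
        // token back and clear the flag.)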
        type = next_type;
        token = next_token;
        have_peeked = false;
    }

// ----------------------------------------------------------------------------------------

    int tokenizer_kernel_1::
    peek_type (
    ) const
    {
        const_cast<tokenizer_kernel_1*>(this)->get_token(next_type,next_token);
        have_peeked = true;
        return next_type;
    }

// ----------------------------------------------------------------------------------------

    const std::string& tokenizer_kernel_1::
    peek_token (
    ) const
    {
        const_cast<tokenizer_kernel_1*>(this)->get_token(next_type,next_token);
        have_peeked = true;
        return next_token;
    }

// ----------------------------------------------------------------------------------------

    void tokenizer_kernel_1::
    swap (
        tokenizer_kernel_1& item
    )
    {
        exchange(in,item.in);
        exchange(streambuf,item.streambuf);
        exchange(head,item.head);
        exchange(body,item.body);
        exchange(bodyset,item.bodyset);
        exchange(headset,item.headset);
        exchange(have_peeked,item.have_peeked);
        exchange(next_type,item.next_type);
        exchange(next_token,item.next_token);
    }

// ----------------------------------------------------------------------------------------

    void tokenizer_kernel_1::
    set_identifier_token (
        const std::string& head_,
        const std::string& body_
    )
    {
        using namespace std;

        head = head_;
        body = body_;

        for (unsigned long i = 0; i <= UCHAR_MAX; ++i)
        {
            headset[i] = false;
            bodyset[i] = false;
        }

        for (string::size_type i = 0; i < head.size(); ++i)
            headset[static_cast<unsigned char>(head[i])] = true;
        for (string::size_type i = 0; i < body.size(); ++i)
            bodyset[static_cast<unsigned char>(body[i])] = true;
    }

// ----------------------------------------------------------------------------------------

    const std::string tokenizer_kernel_1::
    get_identifier_head (
    ) const
    {
        return head;
    }

// ----------------------------------------------------------------------------------------

    const std::string tokenizer_kernel_1::
    get_identifier_body (
    ) const
    {
        return body;
    }

// ----------------------------------------------------------------------------------------

    const std::string tokenizer_kernel_1::
    lowercase_letters (
    ) const
    {
        return std::string("abcdefghijklmnopqrstuvwxyz");
    }

// ----------------------------------------------------------------------------------------

    const std::string tokenizer_kernel_1::
    uppercase_letters (
    ) const
    {
        return std::string("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
    }

// ----------------------------------------------------------------------------------------

    const std::string tokenizer_kernel_1::
    numbers (
    ) const
    {
        return std::string("0123456789");
    }

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_TOKENIZER_KERNEL_1_CPp_
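
// ----------------------------------------------------------------------------------------

/*
    Usage sketch (illustrative only, not part of the library).  This assumes
    the interface declared in tokenizer_kernel_1.h, where get_token() fills in
    one of the class-scope type constants (IDENTIFIER, NUMBER, WHITE_SPACE,
    END_OF_LINE, CHAR, END_OF_FILE) together with the token's text:

        #include <sstream>
        #include <iostream>
        #include "tokenizer_kernel_1.h"

        int main()
        {
            std::istringstream input("foo = bar99 + 1");

            dlib::tokenizer_kernel_1 tok;
            tok.set_stream(input);

            int type;
            std::string token;

            // pull tokens until the stream is exhausted
            tok.get_token(type, token);
            while (type != dlib::tokenizer_kernel_1::END_OF_FILE)
            {
                std::cout << type << ": '" << token << "'\n";
                tok.get_token(type, token);
            }
        }
*/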