A Simple Mathematical Expression Tokenizer in C++
How to convert a mathematical expression string into a sequence of units called tokens in C++.

Back in college, I was given the task of implementing a math expression tokenizer function in C++.
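It’s loosely similar to LLM tokenization in that both split text into smaller units, but here the units are defined by fixed lexical rules (numbers, variable names, operators, parentheses) rather than by a learned vocabulary.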
Given the following math expression as a string: (A+B)-9*X
The tokens are: [(, A, +, B, ), -, 9, *, X]
Another example with this math expression: 100+VARX+VARY-VARZ^2
The tokens are: [100, +, VARX, +, VARY, -, VARZ, ^, 2]
Here’s my implementation:
#include <iostream>
using namespace std;
// True when chr may appear in a variable name: an ASCII letter of either
// case, a dollar sign, or an underscore.
bool isAlphabet(char chr)
{
    // the two non-letter characters accepted in names
    if (chr == '_' || chr == '$')
    {
        return true;
    }
    const bool isLower = ('a' <= chr) && (chr <= 'z');
    const bool isUpper = ('A' <= chr) && (chr <= 'Z');
    return isLower || isUpper;
}
// True when chr is one of the decimal digits '0' through '9'.
bool isNumber(char chr)
{
    // anything below '0' cannot be a digit
    if (chr < '0')
    {
        return false;
    }
    return chr <= '9';
}
// True when chr is one of the supported arithmetic operators:
// addition, subtraction, division, multiplication or exponentiation.
bool isOperator(char chr)
{
    switch (chr)
    {
    case '+':
    case '-':
    case '/':
    case '*':
    case '^':
        return true;
    default:
        return false;
    }
}
// True when chr is an opening or closing round parenthesis.
bool isBranch(char chr)
{
    switch (chr)
    {
    case '(':
    case ')':
        return true;
    default:
        return false;
    }
}
// True when chr is the null character that terminates a string.
bool isNull(char chr)
{
    const char terminator = '\0';
    return terminator == chr;
}
// True when chr is ASCII whitespace: space, horizontal tab, line feed,
// carriage return, vertical tab or form feed.
//
// Accepting more than the plain space lets an expression containing tabs or
// a trailing '\r' (a line with Windows line endings) be skipped over as
// whitespace instead of being treated as an unrecognised character.
bool isSpace(char chr)
{
    switch (chr)
    {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '\v':
    case '\f':
        return true;
    default:
        return false;
    }
}
typedef struct MathTokenizer
{
// the expression string
string str;
// pointer of char of expression
char *pointer;
// reset pointer to the first character of the expression
void reset()
{
pointer = &str.at(0);
}
// get token from expression
string getToken()
{
string result = "";
// if pointer is null then end this process
if (isNull(*pointer))
{
return "";
}
// if pointer is space then check next
// ignore spaces
else if (isSpace(*pointer))
{
pointer++;
while (true)
{
if (isNull(*pointer))
{
return "";
}
else if (isSpace(*pointer))
{
pointer++;
}
else
{
break;
}
}
}
// if operator or branch
if (isOperator(*pointer) || isBranch(*pointer))
{
result = *pointer;
pointer++;
}
// if variable / alphabet
else if (isAlphabet(*pointer))
{
result = *pointer;
pointer++;
while (true)
{
if (isAlphabet(*pointer) || isNumber(*pointer))
{
result += *pointer;
}
else
{
break;
}
pointer++;
}
}
// if number
else if (isNumber(*pointer))
{
result = *pointer;
pointer++;
while (true)
{
if (isNumber(*pointer))
{
result += *pointer;
}
else
{
break;
}
pointer++;
}
}
return result;
}
} MathTokenizer;
// Reads expressions from standard input, one per line, and prints the tokens
// of each expression on separate lines. Returns 0 once input is exhausted
// or a read fails.
int main()
{
    MathTokenizer exp;
    std::string s;
    // getline() returns the stream, which tests false at end of input or on
    // a read error; looping on it avoids spinning forever after EOF
    while (std::getline(std::cin, exp.str))
    {
        // a blank line has no tokens; skip it so reset() is never called on
        // an empty string
        if (exp.str.empty())
        {
            continue;
        }
        exp.reset();
        // show tokens, one per line, until getToken() signals the end
        while ((s = exp.getToken()) != "")
        {
            std::cout << s << '\n';
        }
    }
    return 0;
}
Get the complete source on my Git repository.
This is a rewrite of my old blog post of the same title, with cleaner wording.