OK. The following is a manual approach to lexing such formulas. I've
assumed that they can contain whitespace.
-------------------------------------- chemical-elements.hpp
#pragma once
#include <string_view>
namespace chemical
{
    using std::string_view;

    /// One entry of the periodic table.
    struct Element
    {
        int             number;     ///< Atomic number (1 for hydrogen, 2 for helium, ...).
        string_view     symbol;     ///< Chemical symbol, e.g. "He".
        string_view     name;       ///< English element name, e.g. "Helium".
    };

    /// All elements, ordered by atomic number, so `elements[i].number == i + 1`.
    ///
    /// Declared `inline` (C++17) so that every translation unit that includes
    /// this header refers to one and the same array. A plain namespace-scope
    /// `constexpr` array has internal linkage, which gives each translation unit
    /// its own copy; inline functions that keep pointers into the table would
    /// then differ between translation units.
    inline constexpr Element elements[] =
    {
        {   1, "H",  "Hydrogen" },
        {   2, "He", "Helium" },
        {   3, "Li", "Lithium" },
        {   4, "Be", "Beryllium" },
        {   5, "B",  "Boron" },
        {   6, "C",  "Carbon" },
        {   7, "N",  "Nitrogen" },
        {   8, "O",  "Oxygen" },
        {   9, "F",  "Fluorine" },
        {  10, "Ne", "Neon" },
        {  11, "Na", "Sodium" },
        {  12, "Mg", "Magnesium" },
        {  13, "Al", "Aluminium" },
        {  14, "Si", "Silicon" },
        {  15, "P",  "Phosphorus" },
        {  16, "S",  "Sulfur" },
        {  17, "Cl", "Chlorine" },
        {  18, "Ar", "Argon" },
        {  19, "K",  "Potassium" },
        {  20, "Ca", "Calcium" },
        {  21, "Sc", "Scandium" },
        {  22, "Ti", "Titanium" },
        {  23, "V",  "Vanadium" },
        {  24, "Cr", "Chromium" },
        {  25, "Mn", "Manganese" },
        {  26, "Fe", "Iron" },
        {  27, "Co", "Cobalt" },
        {  28, "Ni", "Nickel" },
        {  29, "Cu", "Copper" },
        {  30, "Zn", "Zinc" },
        {  31, "Ga", "Gallium" },
        {  32, "Ge", "Germanium" },
        {  33, "As", "Arsenic" },
        {  34, "Se", "Selenium" },
        {  35, "Br", "Bromine" },
        {  36, "Kr", "Krypton" },
        {  37, "Rb", "Rubidium" },
        {  38, "Sr", "Strontium" },
        {  39, "Y",  "Yttrium" },
        {  40, "Zr", "Zirconium" },
        {  41, "Nb", "Niobium" },
        {  42, "Mo", "Molybdenum" },
        {  43, "Tc", "Technetium" },
        {  44, "Ru", "Ruthenium" },
        {  45, "Rh", "Rhodium" },
        {  46, "Pd", "Palladium" },
        {  47, "Ag", "Silver" },
        {  48, "Cd", "Cadmium" },
        {  49, "In", "Indium" },
        {  50, "Sn", "Tin" },
        {  51, "Sb", "Antimony" },
        {  52, "Te", "Tellurium" },
        {  53, "I",  "Iodine" },
        {  54, "Xe", "Xenon" },
        {  55, "Cs", "Caesium" },
        {  56, "Ba", "Barium" },
        {  57, "La", "Lanthanum" },
        {  58, "Ce", "Cerium" },
        {  59, "Pr", "Praseodymium" },
        {  60, "Nd", "Neodymium" },
        {  61, "Pm", "Promethium" },
        {  62, "Sm", "Samarium" },
        {  63, "Eu", "Europium" },
        {  64, "Gd", "Gadolinium" },
        {  65, "Tb", "Terbium" },
        {  66, "Dy", "Dysprosium" },
        {  67, "Ho", "Holmium" },
        {  68, "Er", "Erbium" },
        {  69, "Tm", "Thulium" },
        {  70, "Yb", "Ytterbium" },
        {  71, "Lu", "Lutetium" },
        {  72, "Hf", "Hafnium" },
        {  73, "Ta", "Tantalum" },
        {  74, "W",  "Tungsten" },
        {  75, "Re", "Rhenium" },
        {  76, "Os", "Osmium" },
        {  77, "Ir", "Iridium" },
        {  78, "Pt", "Platinum" },
        {  79, "Au", "Gold" },
        {  80, "Hg", "Mercury" },
        {  81, "Tl", "Thallium" },
        {  82, "Pb", "Lead" },
        {  83, "Bi", "Bismuth" },
        {  84, "Po", "Polonium" },
        {  85, "At", "Astatine" },
        {  86, "Rn", "Radon" },
        {  87, "Fr", "Francium" },
        {  88, "Ra", "Radium" },
        {  89, "Ac", "Actinium" },
        {  90, "Th", "Thorium" },
        {  91, "Pa", "Protactinium" },
        {  92, "U",  "Uranium" },
        {  93, "Np", "Neptunium" },
        {  94, "Pu", "Plutonium" },
        {  95, "Am", "Americium" },
        {  96, "Cm", "Curium" },
        {  97, "Bk", "Berkelium" },
        {  98, "Cf", "Californium" },
        {  99, "Es", "Einsteinium" },
        { 100, "Fm", "Fermium" },
        { 101, "Md", "Mendelevium" },
        { 102, "No", "Nobelium" },
        { 103, "Lr", "Lawrencium" },
        { 104, "Rf", "Rutherfordium" },
        { 105, "Db", "Dubnium" },
        { 106, "Sg", "Seaborgium" },
        { 107, "Bh", "Bohrium" },
        { 108, "Hs", "Hassium" },
        { 109, "Mt", "Meitnerium" },
        { 110, "Ds", "Darmstadtium" },
        { 111, "Rg", "Roentgenium" },
        { 112, "Cn", "Copernicium" },
        { 113, "Nh", "Nihonium" },
        { 114, "Fl", "Flerovium" },
        { 115, "Mc", "Moscovium" },
        { 116, "Lv", "Livermorium" },
        { 117, "Ts", "Tennessine" },
        { 118, "Og", "Oganesson" }
    };
} // namespace chemical
-------------------------------------- chemical_formula-Tokenizer.hpp
#pragma once
#include "chemical-elements.hpp" // chemical::(Element, elements)
#include <cppx-core/all.hpp> // <url: https://github.com/alf-p-steinbach/cppx-core>
namespace chemical_formula
{
// Names pulled in from the standard library and from cppx-core.
// Each comment is kept on a single line: a wrapped comment tail would end up
// inside the macro's argument list as stray tokens.
$use_std(
    stoi, string_view
    );
$use_cppx(
    Map_,                   // A std::unordered_map with [] indexing of const instance.
    P_,                     // P_<T> is an alias for T*. It supports prefix const.
    p_first_of, p_beyond_of
    );
namespace ascii = cppx::ascii;      // ascii::is_*
/// One lexical unit of a chemical formula.
struct Token
{
    /// Token categories. The parenthesis and bracket kinds use the character's
    /// own code as enumerator value, so such a character can be converted
    /// directly to its kind.
    struct Kind{ enum Enum {
        none,               // Not a valid token.
        element,            // An element symbol.
        number,             // A sequence of digits.
        left_parens = '(', right_parens = ')',
        left_bracket = '[', right_bracket = ']'
    }; };

    Kind::Enum      kind;   // Category of this token.
    string_view     text;   // The token's characters.
    int             n;      // Only used for `element` and `number`.
};
class Tokenizer
{
const string_view m_formula;
const P_<const char> m_p_beyond_formula;
Token m_current;
struct Symbols_to_elements_map:
Map_<string_view, P_<const chemical::Element>>
{
Symbols_to_elements_map()
{
auto& self = *this;
for( const chemical::Element& elem : chemical::elements ) {
self[elem.symbol] = &elem;
}
}
};
auto is_in_formula( const P_<const char> p )
-> bool
{ return p != m_p_beyond_formula; }
void find_token_that_starts_at( const P_<const char> p_start )
{
static const auto symbols = Symbols_to_elements_map();
const char first_char = *p_start;
if( ascii::is_digit( first_char ) ) {
P_<const char> p_beyond = p_start + 1;
while( is_in_formula( p_beyond ) and ascii::is_digit(
*p_beyond ) ) {
++p_beyond;
}
const auto text = string_view( p_start, p_beyond -
p_start );
try {
m_current = { Token::Kind::number, text, stoi(
p_start ) };
} catch( ... ) {
m_current = { Token::Kind::none, text, -1 };
}
} else if( ascii::is_uppercase( first_char ) ) {
P_<const char> p_beyond = p_start + 1;
while( is_in_formula( p_beyond ) and
ascii::is_lowercase( *p_beyond ) ) {
++p_beyond;
}
const auto text = string_view( p_start, p_beyond -
p_start );
try {
m_current = { Token::Kind::element, text,
symbols[text]->number };
} catch( ... ) {
m_current = { Token::Kind::none, text, -1 };
}
} else {
const auto text = string_view( p_start, 1 );
switch( first_char ) {
case '(': [[fallthrough]];
case ')': [[fallthrough]];
case '[': [[fallthrough]];
case ']': {
m_current = { Token::Kind::Enum( first_char ),
text, -1 };
break;
}
default: {
m_current = { Token::Kind::none, text, -1 };
}
}
}
}
void find_next_remaining_token()
{
P_<const char> p_start = p_beyond_of( m_current.text );
while( is_in_formula( p_start ) and ascii::is_whitespace(
*p_start ) ) {
++p_start;
}
if( p_start == m_p_beyond_formula ) {
m_current = { Token::Kind::none, string_view( p_start,
0 ), -1 };
return;
}
find_token_that_starts_at( p_start );
}
public:
auto current() const
-> Token
{ return m_current; }
auto is_at_end() const
-> bool
{ return p_first_of( m_current.text ) == p_beyond_of( m_formula
); }
void advance()
{
find_next_remaining_token();
}
Tokenizer( const string_view& formula ):
m_formula( formula ),
m_p_beyond_formula( p_beyond_of( formula ) ),
m_current{ Token::Kind::none, string_view( formula.data(),
0 ), -1 }
{
assert( m_current.text.data() == formula.data() );
find_next_remaining_token();
}
};
} // namespace chemical_formula
-------------------------------------- main.cpp
#include "chemical_formula-Tokenizer.hpp"
#include <cppx-core/all.hpp>
auto main() -> int
{
$use_std( cout, endl );
const auto& formula = "[Be(N(CH3)24)255555555555555555555]3";
cout << "Tokens:" << endl;
for( auto tokens = chemical_formula::Tokenizer( formula );
not tokens.is_at_end();
tokens.advance() )
{
const chemical_formula::Token tok = tokens.current();
cout << "* ";
if( tok.kind == chemical_formula::Token::Kind::none ) {
cout << "<invalid> ";
}
cout << "\"" << tok.text << "\"" << endl;
}
}
Output:
Tokens:
* "["
* "Be"
* "("
* "N"
* "("
* "C"
* "H"
* "3"
* ")"
* "24"
* ")"
* <invalid> "255555555555555555555"
* "]"
* "3"