reglibcpp  1.7.0
(Naïve) C++ implementation of models for regular languages
expression.cpp
Go to the documentation of this file.
1 #include "expression.h"
3 
4 using std::make_pair;
5 using std::pair;
6 using std::string;
7 using std::vector;
8 using std::unique_ptr;
9 using std::make_unique;
10 using std::u32string;
11 
12 #include <algorithm>
13 
14 #include <array>
15 using std::array;
16 
17 #include <bitset>
18 using std::bitset;
19 
20 #include "gnfa.h"
21 #include "nfa.h"
22 #include "dfa.h"
23 
24 namespace reg {
25 expression::exptr expression::empty = expression::exptr(new expression);
27 
28 char32_t expression::L = U'(';
29 char32_t expression::R = U')';
30 char32_t expression::K = U'*';
31 char32_t expression::A = U'+';
32 char32_t expression::E = U'ε';
33 char32_t expression::N = U'∅';
34 
37  L = U'(';
38  R = U')';
39  K = U'*';
40  A = U'+';
41  E = U'ε';
42  N = U'∅';
43 }
44 
46 
51  return expression::empty;
52 }
53 
55 
60  return expression::spawnSymbol(U'\0');
61 }
62 
64 
69 expression::exptr const& expression::spawnSymbol(char32_t symbol) {
70  return expression::symbols.insert(make_pair(symbol, exptr(new expression(symbol)))).first->second;
71 }
72 
74 expression::exptr const& expression::spawnSymbol(string const& utf8Symbol) {
75  char32_t u32Symbol(converter.from_bytes(utf8Symbol)[0]);
76  return expression::symbols.insert(make_pair(u32Symbol, exptr(new expression(u32Symbol)))).first->second;
77 }
78 
80 
88 expression::exptr expression::spawnConcatenation(expression::exptr const& l, expression::exptr const& r, bool optimized, bool aggressive) {
89  if (optimized) {
90  if (l == expression::spawnEmptyString()) {
91  return r;
92  }
93  if (r == expression::spawnEmptyString()) {
94  return l;
95  }
98  }
99  if (aggressive) {
100  if (*l == *expression::spawnEmptyString()) {
101  return r;
102  }
103  if (*r == *expression::spawnEmptyString()) {
104  return l;
105  }
106  if (*r == *expression::spawnEmptySet() || *l == *expression::spawnEmptySet()) {
107  return expression::spawnEmptySet();
108  }
109  }
110  }
111  return expression::exptr(new expression(l, r, operation::concatenation));
112 }
113 
115 
123 expression::exptr expression::spawnAlternation(expression::exptr const& l, expression::exptr const& r, bool optimized, bool aggressive) {
124  if (optimized) {
125  if (l == expression::spawnEmptySet()) {
126  return r;
127  }
128  if (r == expression::spawnEmptySet()) {
129  return l;
130  }
131  if (l == r) {
132  return l;
133  }
134  if (aggressive) {
135  if (*l == *expression::spawnEmptySet()) {
136  return r;
137  }
138  if (*r == *expression::spawnEmptySet()) {
139  return l;
140  }
141  if (*l == *r) {
142  return l->size() > r->size() ? r : l;
143  }
144  dfa empty;
145  if (empty == nfa::subtract(*l, *r).build()) {
146  return r;
147  }
148  if (empty == nfa::subtract(*r, *l).build()) {
149  return l;
150  }
151  }
152  }
153  return expression::exptr(new expression(l, r, operation::alternation));
154 }
155 
157 
164 expression::exptr expression::spawnKleene(expression::exptr const& b, bool optimized, bool aggressive) {
165  if (optimized) {
168  }
169  if (aggressive) {
170  if (*b == *expression::spawnEmptySet() || *b == *expression::spawnEmptyString()) {
172  }
173  if (*b == expression(b)) {
174  return b;
175  }
176  }
177  }
178  return expression::exptr(new expression(b));
179 }
180 
182 
192 size_t expression::size() const {
193  size_t s(1);
194  for (exptr const& re : *this) {
195  s += re->size();
196  }
197  return s;
198 }
199 
201 
206  return op;
207 }
208 
209 #ifndef DOXYGEN_SHOULD_SKIP_THIS
210 expression::operator nfa const&() const {
211  if (!acceptingNfa) {
212  acceptingNfa = make_unique<nfa const>(gnfa(*this).splitAllTransitions());
213  }
214  return *acceptingNfa;
215 }
216 #endif // DOXYGEN_SHOULD_SKIP_THIS
217 
219 
222 bool expression::operator==(nfa const& other) const {
223  return other.operator==(*this);
224 }
225 
227 
230 bool expression::operator!=(nfa const& other) const {
231  return !operator==(other);
232 }
233 
235 bool expression::operator==(expression const& other) const {
236  return *this == static_cast<nfa const&>(other);
237 }
238 
240 bool expression::operator!=(expression const& other) const {
241  return *this != static_cast<nfa const&>(other);
242 }
243 
245 
249 char32_t expression::extractSymbol() const {
250  auto it = std::find_if(
251  expression::symbols.begin(),
252  expression::symbols.end(),
253  [&](pair<char32_t,exptr> const& entry)->bool{return entry.second.get() == this;}
254  );
255  if (it == expression::symbols.end()) {
256  throw std::logic_error("This RE does not seem to be a valid symbol expression.");
257  }
258  return it->first;
259 }
260 
262 
267  char32_t symbol(extractSymbol());
268  if (symbol == U'\0') {
269  return string();
270  } else {
271  return converter.to_bytes(extractSymbol());
272  }
273 }
274 
277  switch (op) {
278  case operation::alternation : return subExpressions[0]->to_u32string() + A + subExpressions[1]->to_u32string();
279  case operation::concatenation : {
280  u32string concat;
281  if (subExpressions[0]->op >= operation::alternation) {
282  concat.append(L + subExpressions[0]->to_u32string() + R);
283  } else {
284  concat.append(subExpressions[0]->to_u32string());
285  }
286  if (subExpressions[1]->op >= operation::alternation) {
287  concat.append(L + subExpressions[1]->to_u32string() + R);
288  } else {
289  concat.append(subExpressions[1]->to_u32string());
290  }
291  return concat;
292  }
293  case operation::kleene : {
294  if (subExpressions[0]->op >= operation::concatenation) {
295  return (L + subExpressions[0]->to_u32string() + R) + K;
296  } else {
297  return subExpressions[0]->to_u32string() + K;
298  }
299  }
300  case operation::symbol : {
301  char32_t symbol = extractSymbol();
302  return u32string(1, symbol + (!symbol * E));
303  }
304  case operation::empty : return u32string(1, N);
305  default : return u32string();
306  }
307 }
308 
310 string expression::to_string() const {
311  return converter.to_bytes(to_u32string());
312 }
313 
316  return subExpressions.cbegin();
317 }
318 
321  return subExpressions.cend();
322 }
323 
325 
352  enum struct token : unsigned char {
353  A,
354  B,
355  C,
356  K,
357  E,
358  F,
359  Σ,
360  P,
361  L,
362  R,
363  S,
364  END
365  };
366 
368  #define TOKEN(T) static_cast<unsigned char>(reg::expression::parser::token::T)
369 
372 
375 
379 
380 
382 
388  static tokens getUClosure(tokens const& m) {
389  tokens closure;
390  for (unsigned char c(0); c < TOKEN(END); c++) {
391  if (m[c]) {
392  for (token s(static_cast<token>(c)); s != token::END; s = inverseUnitGraph[static_cast<unsigned char>(s)]) {
393  closure.set(static_cast<unsigned char>(s));
394  }
395  }
396  } // Going through the tokens in m.
397  return closure;
398  }
399 
401 
407  static bool canDerive(token symbol, tokens const& left, tokens const& right) {
408  tokens leftClosure(getUClosure(left));
409  tokens rightClosure(getUClosure(right));
410  for (unsigned char li(0); li < TOKEN(END); li++) {
411  if (leftClosure[li]) {
412  for (unsigned char ri(0); ri < TOKEN(END); ri++) {
413  if (rightClosure[ri]) {
414  for (unsigned char ci(0); ci < TOKEN(END); ci++) {
415  if (inverseBinaryRules[li][ri][ci]) {
416  for (unsigned char successor(ci); successor != TOKEN(END); successor = static_cast<unsigned char>(inverseUnitGraph[successor])) {
417  if (static_cast<unsigned char>(symbol) == successor) {
418  return true;
419  }
420  }
421  }
422  }
423  }
424  }
425  }
426  }
427  return false;
428  }
429 
431 
439  static void compileTableEntry(size_t row, size_t diag, vector<vector<tokens>>& table) {
440  for (size_t i(0); i < diag; i++) {
441  tokens first(getUClosure(table[row][i]));
442  for (unsigned char fi(0); fi < first.size(); fi++) {
443  if (first[fi]) {
444  tokens second(getUClosure(table[row+1+i][diag-i-1]));
445  for (unsigned char si(0); si < second.size(); si++) {
446  if (second[si]) {
447  table[row][diag] |= inverseBinaryRules[fi][si];
448  }
449  } // Going through the tokens in second.
450  }
451  } // Going through the tokens in first.
452  }
453  }
454 
456  size_t mutable symbolMappingIndex;
458  bool badRegularExpression = false;
459 
461  parser(u32string const& re) {
462  if (re.empty()) {
463  badRegularExpression = true;
464  return;
465  }
466  symbolMappingIndex = 0;
467  size_t numberOfTokens(re.length());
468  symbolMapping.reserve(numberOfTokens);
469  table.reserve(numberOfTokens);
470  size_t row(0);
471  for (char32_t symbol : re) {
472  table.push_back(vector<tokens>(numberOfTokens-row, tokens()));
473  if (symbol == L) { table[row][0].set(TOKEN(L)); }
474  else if (symbol == R) { table[row][0].set(TOKEN(R)); }
475  else if (symbol == A) { table[row][0].set(TOKEN(P)); }
476  else if (symbol == K) { table[row][0].set(TOKEN(S)); }
477  else {
478  table[row][0].set(TOKEN(Σ));
479  symbolMapping.push_back(symbol);
480  }
481  row++;
482  }
483  symbolMapping.shrink_to_fit();
484  for (size_t diag(1); diag < numberOfTokens; diag++) {
485  for (size_t row(0); row < numberOfTokens - diag; row++) {
486  compileTableEntry(row, diag, table);
487  }
488  }
489  if (!getUClosure(table[0][table[0].size()-1])[TOKEN(A)]) {
490  badRegularExpression = true;
491  }
492  }
493 
495  struct tree {
496  parser const* p;
500 
502 
511  for (size_t i = 0; i < diag; i++) {
512  size_t leftDiag = diag-i-1;
513  size_t rightDiag = i;
514  size_t rightRow = diag+row-rightDiag;
515  if (canDerive(symbol, p->table[row][leftDiag], p->table[rightRow][rightDiag])) {
516  return make_pair(make_unique<tree>(row, leftDiag, p), make_unique<tree>(rightRow, rightDiag, p));
517  }
518  }
519  return make_pair(unique_ptr<tree>(), unique_ptr<tree>());
520  }
521 
523 
528  tree(size_t row, size_t diag, parser const* p) :
529  p(p),
530  symbol([&]()->token{
531  for (unsigned char i(TOKEN(END)); i > 0; i--) {
532  if (p->table[row][diag][i-1]) {
533  return static_cast<token>(i-1);
534  }
535  }
536  return token::END;
537  }()),
538  children(findNextPair(symbol, row, diag, p)) {}
539 
541  exptr operator()(bool optimized, bool aggressive) {
542  switch (symbol) {
543  case token::A:
544  return spawnAlternation((*children.first)(optimized, aggressive), (*children.second)(optimized, aggressive), optimized, aggressive);
545  case token::B:
546  return (*children.second)(optimized, aggressive);
547  case token::C:
548  return spawnConcatenation((*children.first)(optimized, aggressive), (*children.second)(optimized, aggressive), optimized, aggressive);
549  case token::K:
550  return spawnKleene((*children.first)(optimized, aggressive), optimized, aggressive);
551  case token::E:
552  return (*children.second)(optimized, aggressive);
553  case token::F:
554  return (*children.first)(optimized, aggressive);
555  case token::Σ: {
556  char32_t symbol = p->symbolMapping[p->symbolMappingIndex++];
557  if (p->symbolMappingIndex >= p->symbolMapping.size()) { p->symbolMappingIndex = 0; }
558  if (symbol == E) { return spawnEmptyString(); }
559  else if (symbol == N) { return spawnEmptySet(); }
560  else { return spawnSymbol(symbol); }
561  }
562  default:
563  return exptr();
564  }
565  }
566  };
567 
569 
573  exptr operator()(bool optimized, bool aggressive) {
574  if (badRegularExpression) {
575  throw std::bad_function_call();
576  }
577  return tree(0, table.size()-1, this)(optimized, aggressive);
578  }
579 };
580 
584  graph.fill(token::END);
585  graph[TOKEN(Σ)] = token::E;
586  graph[TOKEN(E)] = token::K;
587  graph[TOKEN(K)] = token::C;
588  graph[TOKEN(C)] = token::A;
589  return graph;
590 }();
591 
594  array<tokens,TOKEN(END)> noPredecessor;
595  noPredecessor.fill(tokens());
597  rules.fill(noPredecessor);
598  rules[TOKEN(A)][TOKEN(B)].set(TOKEN(A));
599  rules[TOKEN(P)][TOKEN(C)].set(TOKEN(B));
600  rules[TOKEN(C)][TOKEN(K)].set(TOKEN(C));
601  rules[TOKEN(E)][TOKEN(S)].set(TOKEN(K));
602  rules[TOKEN(L)][TOKEN(F)].set(TOKEN(E));
603  rules[TOKEN(A)][TOKEN(R)].set(TOKEN(F));
604  return rules;
605 }();
606 #undef TOKEN
607 
609 expression::exptr expression::spawnFromString(std::u32string const& re, literals lits, bool optimized, bool aggressive) {
610  char32_t l = L;
611  char32_t r = R;
612  char32_t k = K;
613  char32_t a = A;
614  char32_t e = E;
615  char32_t n = N;
616  L = lits.L;
617  R = lits.R;
618  K = lits.S;
619  A = lits.P;
620  E = lits.EPSILON;
621  N = lits.EMPTY;
622  parser stringParser(re);
623  exptr result;
624  try {
625  result = stringParser(optimized, aggressive);
626  L = l;
627  R = r;
628  K = k;
629  A = a;
630  E = e;
631  N = n;
632  } catch (std::bad_function_call ex) {
633  L = l;
634  R = r;
635  K = k;
636  A = a;
637  E = e;
638  N = n;
639  throw std::invalid_argument("Malformed regular expression.");
640  }
641  return result;
642 }
643 
645 expression::exptr expression::spawnFromString(string const& utf8Re, literals lits, bool optimized, bool aggressive) {
646  return spawnFromString(converter.from_bytes(utf8Re), lits, optimized, aggressive);
647 }
648 
650 
657 expression::exptr expression::spawnFromString(std::u32string const& re, bool optimized, bool aggressive) {
658  parser stringParser(re);
659  try {
660  return stringParser(optimized, aggressive);
661  } catch (std::bad_function_call e) {
662  throw std::invalid_argument("Malformed regular expression.");
663  }
664 }
665 
667 expression::exptr expression::spawnFromString(string const& utf8Re, bool optimized, bool aggressive) {
668  return spawnFromString(converter.from_bytes(utf8Re), optimized, aggressive);
669 }
670 
671 expression::expression() : op(operation::empty) {}
672 expression::expression(char32_t symbol) : op(operation::symbol) {}
673 expression::expression(exptr const& l, exptr const& r, operation op) : subExpressions({l, r}), op(op) {}
674 expression::expression(exptr const& b) : subExpressions({b}), op(operation::kleene) {}
675 }
Represents the table entries as binary trees.
Definition: expression.cpp:495
static array< array< tokens, TOKEN(END)>, TOKEN(END)> const inverseBinaryRules
Maps pairs of symbols to the symbols that derive them.
Definition: expression.cpp:377
char32_t const S
The Kleene star.
Definition: expression.h:52
static bool canDerive(token symbol, tokens const &left, tokens const &right)
Checks if a token could derive a pair of tokens from two other entries.
Definition: expression.cpp:407
Represents nondeterministic finite automata with ε-moves.
Definition: nfa.h:23
static tokens getUClosure(tokens const &m)
Constructs the reflexive-transitive closure of the inverse unit relation for a given set of symbols...
Definition: expression.cpp:388
pair< unique_ptr< tree >, unique_ptr< tree > > findNextPair(token symbol, size_t row, size_t diag, parser const *p)
Finds the child trees that can be derived from a given entry.
Definition: expression.cpp:510
token symbol
This tree&#39;s root symbol.
Definition: expression.cpp:497
static char32_t N
The symbol used to represent the Null/empty set in a regular expression.
Definition: expression.h:41
Represents deterministic finite automata.
Definition: dfa.h:21
char32_t const EMPTY
Neutral element of alternation and annihilating element of concatenation, a.k.a. empty set...
Definition: expression.h:52
bool operator==(nfa const &other) const
Checks whether this RE describes the same regular language as another object.
Definition: expression.cpp:222
std::vector< exptr >::const_iterator begin() const
Returns an iterator pointing to this RE's first subexpression.
Definition: expression.cpp:315
Beginning of a new subexpression.
static exptr const & spawnEmptyString()
Gives an RE representing the empty string ε.
Definition: expression.cpp:59
nfa splitAllTransitions()
Splits all transitions until only ∅, ε, and symbol REs remain and builds the resulting NFA...
Definition: gnfa.cpp:324
#define TOKEN(T)
Gives casting to base type back to scoped enums, as God intended.
Definition: expression.cpp:368
bool operator!=(nfa const &other) const
Checks whether this RE describes a different regular language than another object.
Definition: expression.cpp:230
std::wstring_convert< std::codecvt_utf8< char32_t >, char32_t > converter
Converts between UTF-8-encoded and UTF-32-encoded strings.
Definition: utils.cpp:4
vector< vector< tokens > > table
The table of sets of symbols that derive a subsentence.
Definition: expression.cpp:457
exptr operator()(bool optimized, bool aggressive)
Gives the RE encoded in this tree.
Definition: expression.cpp:541
static exptr spawnAlternation(exptr const &l, exptr const &r, bool optimized=true, bool aggressive=false)
Gives an RE representing the alternation of two given REs.
Definition: expression.cpp:123
Second part of an alternation expression.
Represents formal regular expressions.
Definition: expression.h:28
std::u32string to_u32string() const
Describes this RE in UTF-32-encoded human-readable form.
Definition: expression.cpp:276
bool badRegularExpression
Signifies that the RE used to initialize this object was invalid.
Definition: expression.cpp:458
char32_t extractSymbol() const
Reports this symbol expression's UTF-32-encoded symbol.
Definition: expression.cpp:249
static exptr const & spawnSymbol(char32_t symbol)
Gives an RE representing the given UTF-32-encoded symbol.
Definition: expression.cpp:69
pair< unique_ptr< tree >, unique_ptr< tree > > children
Trees with the symbols of the entry's derived pair as root.
Definition: expression.cpp:498
vector< char32_t > symbolMapping
Stores the actual symbols encountered in the RE while parsing.
Definition: expression.cpp:455
operation
The different purposes an RE may fulfill.
Definition: expression.h:84
Parses regular expressions.
Definition: expression.cpp:350
Contains the reg::nfa class definition.
token
Tokens the grammar deals with.
Definition: expression.cpp:352
operation getOperation() const
Reports this RE's function.
Definition: expression.cpp:205
Contains the reg::expression class defintion.
std::vector< exptr >::const_iterator end() const
Returns an iterator pointing behind this RE's last subexpression.
Definition: expression.cpp:320
bitset< TOKEN(END)> tokens
Tokens don't usually come alone.
Definition: expression.cpp:371
static char32_t K
The symbol used to represent the Kleene star in a regular expression.
Definition: expression.h:41
Represents generalized nondeterministic finite automata.
Definition: gnfa.h:17
std::string to_string() const
Describes this RE in UTF-8-encoded human-readable form.
Definition: expression.cpp:310
size_t symbolMappingIndex
Index for when symbols have to be extracted from the mapping.
Definition: expression.cpp:456
char32_t const L
The left parenthesis.
Definition: expression.h:52
parser const * p
Points to the parser this tree belongs to.
Definition: expression.cpp:496
static nfa::builder subtract(nfa const &n1, nfa const &n2)
Creates a builder for an NFA accepting the set difference of the languages of two NFAs...
Definition: nfa.cpp:604
Contains the reg::gnfa class definition.
parser(u32string const &re)
Initializes with a string to parse.
Definition: expression.cpp:461
Number of elements in this enumeration, NOT AN ACTUAL TOKEN!
char32_t const P
The alternation symbol.
Definition: expression.h:52
size_t size() const
Reports the size of this RE's tree representation.
Definition: expression.cpp:192
static exptr spawnFromString(std::u32string const &re, literals lits, bool optimized=false, bool aggressive=false)
Definition: expression.cpp:609
static char32_t L
The symbol used to represent the Left parenthesis in a regular expression.
Definition: expression.h:41
A Kleene star symbol.
Where this library lives.
Definition: dfa.cpp:51
static void compileTableEntry(size_t row, size_t diag, vector< vector< tokens >> &table)
Fills a table entry.
Definition: expression.cpp:439
char32_t const EPSILON
Neutral element of concatenation, a.k.a. empty string.
Definition: expression.h:52
Second part of a new subexpression.
static void reset()
Resets the symbols used for RE operators to their defaults.
Definition: expression.cpp:36
static char32_t E
The symbol used to represent the Empty string in a regular expression.
Definition: expression.h:41
exptr operator()(bool optimized, bool aggressive)
Gives the RE resulting from parsing.
Definition: expression.cpp:573
tree(size_t row, size_t diag, parser const *p)
Initializes a tree with a given table entry as root.
Definition: expression.cpp:528
Beginning of an alternation expression.
std::shared_ptr< expression const > exptr
This is the type used to handle regular expressions.
Definition: expression.h:40
static exptr const & spawnEmptySet()
Gives an RE representing the empty set ∅.
Definition: expression.cpp:50
An alternation symbol.
char32_t const R
The right parenthesis.
Definition: expression.h:52
Contains the reg::dfa class definition.
std::string extractUtf8Symbol() const
Reports this symbol expression's UTF-8-encoded symbol.
Definition: expression.cpp:266
static array< token, TOKEN(END)> const inverseUnitGraph
Maps symbols that may be derived in-place to their predecessing symbols.
Definition: expression.cpp:373
static char32_t R
The symbol used to represent the Right parenthesis in a regular expression.
Definition: expression.h:41
static exptr spawnKleene(exptr const &b, bool optimized=true, bool aggressive=false)
Gives an RE representing the Kleene closure of a given RE.
Definition: expression.cpp:164
static exptr spawnConcatenation(exptr const &l, exptr const &r, bool optimized=true, bool aggressive=false)
Gives an RE representing the concatenation of two given REs.
Definition: expression.cpp:88
static char32_t A
The symbol used to represent the Alternation in a regular expression.
Definition: expression.h:41
A concatenation expression.