reglibcpp  1.6.0
(Naïve) C++ implementation of models for regular languages
expression.cpp
Go to the documentation of this file.
1 #include "expression.h"
3 
4 using std::make_pair;
5 using std::pair;
6 using std::string;
7 using std::vector;
8 using std::unique_ptr;
9 using std::make_unique;
10 using std::u32string;
11 
12 #include <algorithm>
13 
14 #include <array>
15 using std::array;
16 
17 #include <bitset>
18 using std::bitset;
19 
20 #include "gnfa.h"
21 #include "nfa.h"
22 #include "dfa.h"
23 
24 namespace reg {
25 expression::exptr expression::empty = expression::exptr(new expression);
27 
29 
34  return expression::empty;
35 }
36 
38 
43  return expression::spawnSymbol(U'\0');
44 }
45 
47 
52 expression::exptr const& expression::spawnSymbol(char32_t symbol) {
53  return expression::symbols.insert(make_pair(symbol, exptr(new expression(symbol)))).first->second;
54 }
55 
57 expression::exptr const& expression::spawnSymbol(string const& utf8Symbol) {
58  char32_t u32Symbol(converter.from_bytes(utf8Symbol)[0]);
59  return expression::symbols.insert(make_pair(u32Symbol, exptr(new expression(u32Symbol)))).first->second;
60 }
61 
63 
71 expression::exptr expression::spawnConcatenation(expression::exptr const& l, expression::exptr const& r, bool optimized, bool aggressive) {
72  if (optimized) {
73  if (l == expression::spawnEmptyString()) {
74  return r;
75  }
76  if (r == expression::spawnEmptyString()) {
77  return l;
78  }
81  }
82  if (aggressive) {
83  if (*l == *expression::spawnEmptyString()) {
84  return r;
85  }
86  if (*r == *expression::spawnEmptyString()) {
87  return l;
88  }
89  if (*r == *expression::spawnEmptySet() || *l == *expression::spawnEmptySet()) {
91  }
92  }
93  }
94  return expression::exptr(new expression(l, r, operation::concatenation));
95 }
96 
98 
106 expression::exptr expression::spawnAlternation(expression::exptr const& l, expression::exptr const& r, bool optimized, bool aggressive) {
107  if (optimized) {
108  if (l == expression::spawnEmptySet()) {
109  return r;
110  }
111  if (r == expression::spawnEmptySet()) {
112  return l;
113  }
114  if (l == r) {
115  return l;
116  }
117  if (aggressive) {
118  if (*l == *expression::spawnEmptySet()) {
119  return r;
120  }
121  if (*r == *expression::spawnEmptySet()) {
122  return l;
123  }
124  if (*l == *r) {
125  return l->size() > r->size() ? r : l;
126  }
127  }
128  }
129  return expression::exptr(new expression(l, r, operation::alternation));
130 }
131 
133 
140 expression::exptr expression::spawnKleene(expression::exptr const& b, bool optimized, bool aggressive) {
141  if (optimized) {
144  }
145  if (aggressive) {
146  if (*b == *expression::spawnEmptySet() || *b == *expression::spawnEmptyString()) {
148  }
149  if (*b == expression(b)) {
150  return b;
151  }
152  }
153  }
154  return expression::exptr(new expression(b));
155 }
156 
158 
168 size_t expression::size() const {
169  size_t s(op != operation::empty);
170  for (exptr const& re : subExpressions) {
171  s += re->size();
172  }
173  return s;
174 }
175 
177 
182  return op;
183 }
184 
186 
189 bool expression::operator==(expression const& r) const {
190  if (!acceptingDfa) {
191  acceptingDfa = make_unique<dfa const>(dfa::builder(gnfa(*this).splitAllTransitions()).minimize());
192  }
193  if (!r.acceptingDfa) {
194  r.acceptingDfa = make_unique<dfa const>(dfa::builder(gnfa(r).splitAllTransitions()).minimize());
195  }
196  return *acceptingDfa == *r.acceptingDfa;
197 }
198 
200 
203 bool expression::operator!=(expression const& r) const {
204  return !operator==(r);
205 }
206 
208 
212 char32_t expression::extractSymbol() const {
213  auto it = std::find_if(
214  expression::symbols.begin(),
215  expression::symbols.end(),
216  [&](pair<char32_t,exptr> const& entry)->bool{return entry.second.get() == this;}
217  );
218  if (it == expression::symbols.end()) {
219  throw std::logic_error("This RE does not seem to be a valid symbol expression.");
220  }
221  return it->first;
222 }
223 
225 
230  char32_t symbol(extractSymbol());
231  if (symbol == U'\0') {
232  return string();
233  } else {
234  return converter.to_bytes(extractSymbol());
235  }
236 }
237 
240  switch (op) {
241  case operation::alternation : return subExpressions[0]->to_u32string().append(U"+").append(subExpressions[1]->to_u32string());
242  case operation::concatenation : {
243  u32string concat;
244  if (subExpressions[0]->op >= operation::alternation) {
245  concat.append(1, '(');
246  concat.append(subExpressions[0]->to_u32string());
247  concat.append(1, ')');
248  } else {
249  concat.append(subExpressions[0]->to_u32string());
250  }
251  if (subExpressions[1]->op >= operation::alternation) {
252  concat.append(1, '(');
253  concat.append(subExpressions[1]->to_u32string());
254  concat.append(1, ')');
255  } else {
256  concat.append(subExpressions[1]->to_u32string());
257  }
258  return concat;
259  }
260  case operation::kleene : {
261  if (subExpressions[0]->op >= operation::concatenation) {
262  return u32string(1, U'(').append(subExpressions[0]->to_u32string()).append(U")*");
263  } else {
264  return subExpressions[0]->to_u32string().append(1, '*');
265  }
266  }
267  case operation::symbol : {
268  char32_t symbol = extractSymbol();
269  return symbol == U'\0' ? u32string(U"ε") : u32string(1, symbol);
270  }
271  case operation::empty : return u32string(U"∅");
272  default : return u32string();
273  }
274 }
275 
277 string expression::to_string() const {
278  return converter.to_bytes(to_u32string());
279 }
280 
283  return subExpressions.cbegin();
284 }
285 
288  return subExpressions.cend();
289 }
290 
292 
319  enum token : unsigned char {
320  A,
321  B,
322  C,
323  K,
324  E,
325  F,
326  Σ,
327  P,
328  L,
329  R,
330  S,
332  };
333 
336 
339 
343 
344 
346 
352  static tokens getUClosure(tokens const& m) {
353  tokens closure;
354  for (unsigned char c(0); c < token::END; c++) {
355  if (m[c]) {
356  for (token s(static_cast<token>(c)); s != token::END; s = inverseUnitGraph[s]) {
357  closure.set(s);
358  }
359  }
360  } // Going through the tokens in m.
361  return closure;
362  }
363 
365 
371  static bool canDerive(token symbol, tokens const& left, tokens const& right) {
372  tokens leftClosure(getUClosure(left));
373  tokens rightClosure(getUClosure(right));
374  for (unsigned char li(0); li < token::END; li++) {
375  if (leftClosure[li]) {
376  for (unsigned char ri(0); ri < token::END; ri++) {
377  if (rightClosure[ri]) {
378  for (unsigned char ci(0); ci < token::END; ci++) {
379  if (inverseBinaryRules[li][ri][ci]) {
380  for (unsigned char successor(ci); successor != token::END; successor = inverseUnitGraph[successor]) {
381  if (symbol == successor) {
382  return true;
383  }
384  }
385  }
386  }
387  }
388  }
389  }
390  }
391  return false;
392  }
393 
394 
396 
404  static void compileTableEntry(size_t row, size_t diag, vector<vector<tokens>>& table) {
405  for (size_t i(0); i < diag; i++) {
406  tokens first(getUClosure(table[row][i]));
407  for (unsigned char fi(0); fi < first.size(); fi++) {
408  if (first[fi]) {
409  tokens second(getUClosure(table[row+1+i][diag-i-1]));
410  for (unsigned char si(0); si < second.size(); si++) {
411  if (second[si]) {
412  table[row][diag] |= inverseBinaryRules[fi][si];
413  }
414  } // Going through the tokens in second.
415  }
416  } // Going through the tokens in first.
417  }
418  }
419 
422  size_t mutable symbolMappingIndex;
424  bool badRegularExpression = false;
425 
427  parser(u32string const& re, literals const& lits) {
428  if (re.empty()) {
429  badRegularExpression = true;
430  return;
431  }
432  symbolMappingIndex = 0;
433  size_t numberOfTokens(re.length());
434  symbolMapping.reserve(numberOfTokens);
435  table.reserve(numberOfTokens);
436  size_t row(0);
437  for (char32_t symbol : re) {
438  table.push_back(vector<tokens>(numberOfTokens-row, tokens()));
439  if (symbol == lits.L) { table[row][0].set(token::L); }
440  else if (symbol == lits.R) { table[row][0].set(token::R); }
441  else if (symbol == lits.P) { table[row][0].set(token::P); }
442  else if (symbol == lits.S) { table[row][0].set(token::S); }
443  else {
444  table[row][0].set(token::Σ);
445  symbolMapping.push_back(symbol);
446  }
447  row++;
448  }
449  symbolMapping.shrink_to_fit();
450  for (size_t diag(1); diag < numberOfTokens; diag++) {
451  for (size_t row(0); row < numberOfTokens - diag; row++) {
452  compileTableEntry(row, diag, table);
453  }
454  }
455  if (!getUClosure(table[0][table[0].size()-1])[token::A]) {
456  badRegularExpression = true;
457  }
458  }
459 
461  struct tree {
462  parser const* p;
466 
468 
477  for (size_t i = 0; i < diag; i++) {
478  size_t leftDiag = diag-i-1;
479  size_t rightDiag = i;
480  size_t rightRow = diag+row-rightDiag;
481  if (canDerive(symbol, p->table[row][leftDiag], p->table[rightRow][rightDiag])) {
482  return make_pair(make_unique<tree>(row, leftDiag, p), make_unique<tree>(rightRow, rightDiag, p));
483  }
484  }
485  return make_pair(unique_ptr<tree>(), unique_ptr<tree>());
486  }
487 
489 
494  tree(size_t row, size_t diag, parser const* p) :
495  p(p),
496  symbol([&]()->token{
497  for (unsigned char i(token::END); i > 0; i--) {
498  if (p->table[row][diag][i-1]) {
499  return static_cast<token>(i-1);
500  }
501  }
502  return token::END;
503  }()),
504  children(findNextPair(symbol, row, diag, p)) {}
505 
507  exptr operator()(bool optimized, bool aggressive) {
508  switch (symbol) {
509  case token::A:
510  return spawnAlternation((*children.first)(optimized, aggressive), (*children.second)(optimized, aggressive), optimized, aggressive);
511  case token::B:
512  return (*children.second)(optimized, aggressive);
513  case token::C:
514  return spawnConcatenation((*children.first)(optimized, aggressive), (*children.second)(optimized, aggressive), optimized, aggressive);
515  case token::K:
516  return spawnKleene((*children.first)(optimized, aggressive), optimized, aggressive);
517  case token::E:
518  return (*children.second)(optimized, aggressive);
519  case token::F:
520  return (*children.first)(optimized, aggressive);
521  case token::Σ: {
522  char32_t symbol = p->symbolMapping[p->symbolMappingIndex++];
523  if (p->symbolMappingIndex >= p->symbolMapping.size()) { p->symbolMappingIndex = 0; }
524  if (symbol == p->lits.EPSILON) { return spawnEmptyString(); }
525  else if (symbol == p->lits.EMPTY) { return spawnEmptySet(); }
526  else { return spawnSymbol(symbol); }
527  }
528  default:
529  return exptr();
530  }
531  }
532  };
533 
535 
539  exptr operator()(bool optimized, bool aggressive) {
540  if (badRegularExpression) {
541  throw std::bad_function_call();
542  }
543  return tree(0, table.size()-1, this)(optimized, aggressive);
544  }
545 };
546 
550  graph.fill(token::END);
551  graph[token::Σ] = token::E;
552  graph[token::E] = token::K;
553  graph[token::K] = token::C;
554  graph[token::C] = token::A;
555  return graph;
556 }();
557 
558 
560  []()->array<array<expression::parser::tokens,expression::parser::token::END>,expression::parser::token::END>{
561  array<tokens,token::END> noPredecessor;
562  noPredecessor.fill(tokens());
563  array<array<tokens,token::END>,token::END> rules;
564  rules.fill(noPredecessor);
565  rules[token::A][token::B].set(token::A);
566  rules[token::P][token::C].set(token::B);
567  rules[token::C][token::K].set(token::C);
568  rules[token::E][token::S].set(token::K);
569  rules[token::L][token::F].set(token::E);
570  rules[token::A][token::R].set(token::F);
571  return rules;
572 }();
573 
575 
583 expression::exptr expression::spawnFromString(std::u32string const& re, literals lits, bool optimized, bool aggressive) {
584  parser stringParser(re, lits);
585  try {
586  return stringParser(optimized, aggressive);
587  } catch (std::bad_function_call e) {
588  throw std::invalid_argument("Malformed regular expression.");
589  }
590 }
591 
592 
594 expression::exptr expression::spawnFromString(string const& utf8Re, literals lits, bool optimized, bool aggressive) {
595  return spawnFromString(converter.from_bytes(utf8Re), lits, optimized, aggressive);
596 }
597 
598 expression::expression() : op(operation::empty) {}
599 expression::expression(char32_t symbol) : op(operation::symbol) {}
600 expression::expression(exptr const& l, exptr const& r, operation op) : subExpressions({l, r}), op(op) {}
601 expression::expression(exptr const& b) : subExpressions({b}), op(operation::kleene) {}
602 }
Number of elements in this enumeration, NOT AN ACTUAL TOKEN!
Definition: expression.cpp:331
Represents the table entries as binary trees.
Definition: expression.cpp:461
static array< array< tokens, token::END >, token::END > const inverseBinaryRules
Maps pairs of symbols to the symbols that derive them.
Definition: expression.cpp:341
An alternation symbol.
Definition: expression.cpp:327
char32_t const S
The Kleene star.
Definition: expression.h:49
static bool canDerive(token symbol, tokens const &left, tokens const &right)
Checks if a token could derive a pair of tokens from two other entries.
Definition: expression.cpp:371
A concatenation expression.
Definition: expression.cpp:322
static tokens getUClosure(tokens const &m)
Constructs the reflexive-transitive closure of the inverse unit relation for a given set of symbols...
Definition: expression.cpp:352
pair< unique_ptr< tree >, unique_ptr< tree > > findNextPair(token symbol, size_t row, size_t diag, parser const *p)
Finds the child trees that can be derived from a given entry.
Definition: expression.cpp:476
token symbol
This tree&#39;s root symbol.
Definition: expression.cpp:463
A left parenthesis.
Definition: expression.cpp:328
bool operator!=(expression const &r) const
Checks whether this RE is semantically different from another one.
Definition: expression.cpp:203
Token literals as used in Introduction to Automata Theory, Languages, and Computation by Hopcroft...
Definition: expression.h:48
char32_t const EMPTY
Neutral element of alternation and annihilating element of concatenation, a.k.a. empty set...
Definition: expression.h:49
std::vector< exptr >::const_iterator begin() const
Returns an iterator pointing to this RE's first subexpression.
Definition: expression.cpp:282
static exptr const & spawnEmptyString()
Gives an RE representing the empty string ε.
Definition: expression.cpp:42
nfa splitAllTransitions()
Splits all transitions until only ∅, ε, and symbol REs remain and builds the resulting NFA...
Definition: gnfa.cpp:287
std::wstring_convert< std::codecvt_utf8< char32_t >, char32_t > converter
Converts between UTF-8-encoded and UTF-32-encoded strings.
Definition: dfa.cpp:1060
vector< vector< tokens > > table
The table of sets of symbols that derive a subsentence.
Definition: expression.cpp:423
builder & minimize()
Convenience method for chaining purge and merge to achieve proper minimization.
Definition: dfa.cpp:844
exptr operator()(bool optimized, bool aggressive)
Gives the RE encoded in this tree.
Definition: expression.cpp:507
static exptr spawnFromString(std::u32string const &re, literals lits=literals(), bool optimized=false, bool aggressive=false)
Gives an RE encoded in a given string.
Definition: expression.cpp:583
static exptr spawnAlternation(exptr const &l, exptr const &r, bool optimized=true, bool aggressive=false)
Gives an RE representing the alternation of two given REs.
Definition: expression.cpp:106
Represents formal regular expressions.
Definition: expression.h:27
Beginning of a new subexpression.
Definition: expression.cpp:324
std::u32string to_u32string() const
Describes this RE in UTF-32-encoded human-readable form.
Definition: expression.cpp:239
bool badRegularExpression
Signifies that the RE used to initialize this object was invalid.
Definition: expression.cpp:424
char32_t extractSymbol() const
Reports this symbol expression's UTF-32-encoded symbol.
Definition: expression.cpp:212
literals lits
Stores interpretations for characters encountered in the parsed string.
Definition: expression.cpp:420
A symbol expression.
Definition: expression.cpp:326
static exptr const & spawnSymbol(char32_t symbol)
Gives an RE representing the given UTF-32-encoded symbol.
Definition: expression.cpp:52
pair< unique_ptr< tree >, unique_ptr< tree > > children
Trees with the symbols of the entry's derived pair as root.
Definition: expression.cpp:464
vector< char32_t > symbolMapping
Stores the actual symbols encountered in the RE while parsing.
Definition: expression.cpp:421
operation
The different purposes an RE may fulfill.
Definition: expression.h:79
Parses regular expressions.
Definition: expression.cpp:317
Contains the reg::nfa class definition.
A Kleene star symbol.
Definition: expression.cpp:330
token
Tokens the grammar deals with.
Definition: expression.cpp:319
operation getOperation() const
Reports this RE's function.
Definition: expression.cpp:181
Contains the reg::expression class defintion.
std::vector< exptr >::const_iterator end() const
Returns an iterator pointing behind this RE's last subexpression.
Definition: expression.cpp:287
Represents generalized nondeterministic finite automata.
Definition: gnfa.h:18
std::string to_string() const
Describes this RE in UTF-8-encoded human-readable form.
Definition: expression.cpp:277
size_t symbolMappingIndex
Index for when symbols have to be extracted from the mapping.
Definition: expression.cpp:422
Second part of an alternation expression.
Definition: expression.cpp:321
bool operator==(expression const &r) const
Checks whether this RE is semantically equivalent to another one.
Definition: expression.cpp:189
char32_t const L
The left parenthesis.
Definition: expression.h:49
A right parenthesis.
Definition: expression.cpp:329
Constructs DFAs step by step.
Definition: dfa.h:72
parser const * p
Points to the parser this tree belongs to.
Definition: expression.cpp:462
Beginning of an alternation expression.
Definition: expression.cpp:320
Contains the reg::gnfa class definition.
char32_t const P
The alternation symbol.
Definition: expression.h:49
size_t size() const
Reports the size of this RE's tree representation.
Definition: expression.cpp:168
Where this library lives.
Definition: dfa.cpp:51
static void compileTableEntry(size_t row, size_t diag, vector< vector< tokens >> &table)
Fills a table entry.
Definition: expression.cpp:404
char32_t const EPSILON
Neutral element of concatenation, a.k.a. empty string.
Definition: expression.h:49
Kleene expression.
Definition: expression.cpp:323
bitset< token::END > tokens
Tokens don't usually come alone.
Definition: expression.cpp:335
parser(u32string const &re, literals const &lits)
Initializes with a string to parse and literals to parse for.
Definition: expression.cpp:427
exptr operator()(bool optimized, bool aggressive)
Gives the RE resulting from parsing.
Definition: expression.cpp:539
tree(size_t row, size_t diag, parser const *p)
Initializes a tree with a given table entry as root.
Definition: expression.cpp:494
std::shared_ptr< expression const > exptr
This is the type used to handle regular expressions.
Definition: expression.h:39
static exptr const & spawnEmptySet()
Gives an RE representing the empty set ∅.
Definition: expression.cpp:33
char32_t const R
The right parenthesis.
Definition: expression.h:49
Contains the reg::dfa class definition.
std::string extractUtf8Symbol() const
Reports this symbol expression's UTF-8-encoded symbol.
Definition: expression.cpp:229
static array< token, token::END > const inverseUnitGraph
Maps symbols that may be derived in-place to their predecessing symbols.
Definition: expression.cpp:337
static exptr spawnKleene(exptr const &b, bool optimized=true, bool aggressive=false)
Gives an RE representing the Kleene closure of a given RE.
Definition: expression.cpp:140
static exptr spawnConcatenation(exptr const &l, exptr const &r, bool optimized=true, bool aggressive=false)
Gives an RE representing the concatenation of two given REs.
Definition: expression.cpp:71
Second part of a new subexpression.
Definition: expression.cpp:325