从文法到解析器的所有算法
最近完成了替代Lex+YACC的自动生成词法分析器+语法分析器的项目,暂且命名为YAC。想拥有自己的解析器的小伙伴可以将文法给我,送解析器。
下面是一个支持加减乘除和括号的四则运算的文法:
Calc.st
syntax {
Additive : Additive '+' Multiplicative
| Additive '-' Multiplicative
| Multiplicative ;
Multiplicative : Multiplicative '*' Primary
| Multiplicative '/' Primary
| Primary ;
Primary : '(' Additive ')'
| 'number' ;
}
//--------------------------------------------
lexi {
'number' : %%[0-9]+%% ; // 示例只处理非负整数
}
options {
'GrammarName' : Calc ;
'ExtractedType' : FinalValue ;
'blockComment' : on ; // 支持多行注释
'inlineComment' : on ; // 支持单行注释
}
编译原理中的语法分析器
根据文法Grammar中的syntax { .. }
部分生成语法分析器,其基本原理是:用产生式描述解析器中的语法结构,分别根据LL(1)、LR(0)、SLR(1)、LALR(1)、LR(1)分析法,将其逐步转换为语法分析器代码。
基础结构
NullableDict
判断一个VNode数组是否可能产生空empty(即什么都没有推导出来)?
NullableDict
查看代码
/// <summary>
/// Compute, for every node (Vt and Vn), whether it can derive empty
/// (i.e. derive nothing at all).
/// </summary>
public static Dictionary<VNode, bool> GetNullableDict(this IReadOnlyList<RegulationDraft> regulationDrafts) {
    var nullable = new Dictionary<VNode, bool>();
    // start from "nothing is nullable" for every kind of node (Vt and Vn).
    foreach (var node in Algo.GetNodes(regulationDrafts)) {
        nullable.Add(node, false);
    }
    // fixed-point iteration: repeat until a full pass finds no new nullable node.
    bool dirty;
    do {
        dirty = false;
        foreach (var regulation in regulationDrafts) {
            // left is nullable whenever its whole right-hand side can derive empty.
            if (!nullable[regulation.left] && CanBeEmpty(regulation.right, nullable)) {
                nullable[regulation.left] = true;
                dirty = true;
            }
        }
    } while (dirty);
    return nullable;
}
// Can the whole nodeList derive empty?
static bool CanBeEmpty(IReadOnlyList<VNode> nodeList, Dictionary<VNode, bool> nullableDict)
    => CanBeEmpty(nodeList, 0, nodeList.Count, nullableDict);
// Can the slice nodeList[checkIndex .. checkIndex+checkCount-1] derive empty,
// i.e. is every node in that range nullable?
static bool CanBeEmpty(IReadOnlyList<VNode> nodeList, int checkIndex, int checkCount, Dictionary<VNode, bool> nullableDict) {
    for (int offset = 0; offset < checkCount; offset++) {
        // one non-nullable node makes the whole slice non-empty-able.
        if (!nullableDict[nodeList[checkIndex + offset]]) { return false; }
    }
    return true;
}
FIRST
给定VNode数组,在它能推导出的所有产生式中,第一个VNode都有哪些?
所有的Vt,其FIRST都是Vt自己。以此为基础,逐步找到全部FIRST。
GetFIRSTDict
计算文法的FIRST集
// Build the dictionary FIRST.key -> FIRST for the whole grammar:
// first for every single node (Vt and Vn), then for every regulation's
// right-hand side (a sequence of nodes).
public static Dictionary<VNodesKey, FIRST> GetFIRSTDict(
    IReadOnlyList<RegulationDraft> regulations,
    Dictionary<VNode, bool> nullableDict) {
    var firstDict = new Dictionary<VNodesKey, FIRST>();
    GetFIRSTDict4Node(regulations, firstDict, nullableDict);
    GetFIRSTDict4Right(regulations, firstDict, nullableDict);
    return firstDict;
}
// Compute FIRST for every regulation's right-hand side (a node sequence).
// FIRST(key) accumulates FIRST(key[i]) for every i whose prefix key[0..i-1]
// can derive empty; FIRST(key) contains empty iff the whole sequence can be empty.
private static void GetFIRSTDict4Right(
    IReadOnlyList<RegulationDraft> regulations,
    Dictionary<VNodesKey, FIRST> result, Dictionary<VNode, bool> emptyDict) {
    // allocate space for all regulation.right (skip duplicates).
    foreach (var regulation in regulations) {
        var key = new VNodesKey(regulation.right);
        var first = new FIRST(key); // not filled up yet.
        if (!result.TryGetValue(first.key, out var _)) {
            result.Add(first.key, first);
        }
    }
    // fixed-point iteration: repeat until no FIRST set grows.
    bool changed = false;
    do {
        changed = false;
        foreach (var first in result.Values) {
            var key = first.key;
            int count = key.Length;
            for (int checkLength = 0; checkLength < count; checkLength++) {
                // if key[0..(checkLength-1)] can all be empty,
                // then FIRST( key ) includes FIRST( key[checkLength] ), except for empty.
                const int checkIndex = 0;
                if (CanBeEmpty(key, checkIndex, checkLength, emptyDict)) {
                    VNode refKey = key[checkLength];
                    // FIX: 'refFirst' was used but never defined here — it is the FIRST set
                    // of the single node refKey, prepared by GetFIRSTDict4Node.
                    if (!result.TryGetValue(new VNodesKey(refKey), out var refFirst)) { throw new Exception(Consts.algorithmError); }
                    // skip when both are the same set (single-node key): copying a set into
                    // itself is a no-op and would mutate the collection being enumerated.
                    if (!ReferenceEquals(refFirst, first)) {
                        foreach (var Vt in refFirst.Values) {
                            if (!first.Contains(Vt)) { first.Add(Vt); changed = true; }
                        }
                    }
                }
            }
            // if all nodes of key can be empty, then FIRST( key ) includes empty.
            if (CanBeEmpty(key, emptyDict)) {
                if (!first.containsEmpty) {
                    first.containsEmpty = true; changed = true;
                }
            }
        }
    } while (changed);
}
// Compute FIRST for every single grammar node.
// FIRST(Vt) = { Vt } (complete from the start). FIRST(Vn) accumulates, for each
// regulation 'Vn : right', the FIRST of every right[i] whose prefix can be empty.
private static void GetFIRSTDict4Node(
    IReadOnlyList<RegulationDraft> regulations,
    Dictionary<VNodesKey, FIRST> result, Dictionary<VNode, bool> emptyDict) {
    // allocate FIRST( Vn ); it contains empty iff Vn is nullable.
    var VnNodes = regulations.GetVnNodes();
    foreach (var Vn in VnNodes) {
        var first = emptyDict[Vn]
            ? new FIRST(key: Vn, containsEmpty: true)
            : new FIRST(key: Vn);
        result.Add(first.key, first);
    }
    // FIRST( Vt ) = { Vt } — already complete.
    var VtNodes = regulations.GetVtNodes();
    foreach (var Vt in VtNodes) {
        var first = new FIRST(Vt, Vt);
        result.Add(first.key, first);
    }
    // fixed-point iteration: repeat until no FIRST( Vn ) grows.
    bool changed = false;
    do {
        changed = false;
        foreach (var regulation in regulations) {
            var left = regulation.left;
            var right = regulation.right;
            // FIX: 'first' (the FIRST set being built for 'left') was used below
            // but never defined — look it up once per regulation.
            if (!result.TryGetValue(new VNodesKey(left), out var first)) { throw new Exception(Consts.algorithmError); }
            // try to collect FIRST( left )
            for (int checkpoint = 0; checkpoint < right.Count; checkpoint++) {
                // if right[0..(checkpoint-1)] can all be empty,
                // then FIRST( left ) includes FIRST( right[checkpoint] ), except for empty.
                if (CanBeEmpty(right, 0, checkpoint, emptyDict)) {
                    var refKey = right[checkpoint];
                    if (left != refKey) {
                        // FIX: 'refFirst' was used but never defined — it is the FIRST set
                        // of the single node refKey (Vt or Vn), allocated above.
                        if (!result.TryGetValue(new VNodesKey(refKey), out var refFirst)) { throw new Exception(Consts.algorithmError); }
                        foreach (var Vt in refFirst.Values) {
                            if (!first.Contains(Vt)) { first.Add(Vt); changed = true; }
                        }
                    }
                }
            }
            // if the whole right-hand side can be empty, FIRST( left ) includes empty.
            if (CanBeEmpty(right, emptyDict)) {
                if (!first.containsEmpty) {
                    first.containsEmpty = true; changed = true;
                }
            }
        }
    } while (changed);
}
FOLLOW
结点Vn的FOLLOW集:在所有可能的推导过程中,可能紧跟在Vn后面的所有终结点(Vt)。
计算文法的FOLLOW集
计算文法的FOLLOW集
/// <summary>
/// returns the dictionary of FOLLOW.Vn -> FOLLOW.
/// FOLLOW( Vn ) collects every Vt that may appear immediately after Vn in some derivation.
/// If <paramref name="eEnd"/> is given ('¥'), it is seeded into FOLLOW of the start symbol.
/// </summary>
public static Dictionary<VNode/*FOLLOW.target*/, FOLLOW> GetFOLLOWDict(this IReadOnlyList<RegulationDraft> regulationDrafts,
    Dictionary<VNode, bool> emptyDict, Dictionary<VNodesKey, FIRST> firstDict, VNode? eEnd = null) {
    var result = new Dictionary<VNode/*FOLLOW.Vn*/, FOLLOW>();
    // allocate space for the FOLLOW( Vn ) items.
    var VnNodes = regulationDrafts.GetVnNodes();
    foreach (var Vn in VnNodes) {
        var follow = new FOLLOW(Vn);
        result.Add(follow.Vn, follow);
    }
    if (eEnd is not null) { // add '¥' to S' : S ; '¥'
        foreach (var follow in result) {
            if (follow.Key == regulationDrafts[0].left) {
                follow.Value.TryInsert(eEnd);
            }
        }
    }
    // fixed-point iteration: repeat until no FOLLOW set changed in a full pass.
    bool changed = false;
    do {
        changed = false;
        foreach (var regulationDraft in regulationDrafts) {
            var right = regulationDraft.right; var count = right.Count;
            for (var checkpoint = 0; checkpoint < count; checkpoint++) {
                VNode target = right[checkpoint];
                if (target.category == VNode.Category.Vt /*target.IsVt()*/) { continue; } // leaf nodes (Vt) have no FOLLOW
                if (target.sourceCode == CompilerGrammar.emptykeyword) {
                    throw new Exception($"No '{CompilerGrammar.emptykeyword}' should appear in {nameof(RegulationDraft.right)}");
                }
                else {
                    // try to collect FOLLOW( target )
                    var checkIndex = checkpoint + 1;
                    for (var checkCount = 0; checkCount < count - checkIndex; checkCount++) {
                        // if right[checkIndex->(checkIndex+checkCount-1)] can be empty,
                        // then FOLLOW( target ) includes FIRST( right[checkIndex+checkCount] )
                        // except empty.
                        if (CanBeEmpty(right, checkIndex, checkCount, emptyDict)) {
                            VNode Vn = target;
                            if (!result.TryGetValue(Vn, out var follow)) { throw new Exception(Consts.algorithmError); }
                            VNode key = right[checkIndex + checkCount];
                            if (!firstDict.TryGetValue(new VNodesKey(key), out var first)) { throw new Exception(Consts.algorithmError); }
                            foreach (var Vt in first.Values) {
                                // TryInsert returns null when Vt was actually inserted (it is new).
                                if (follow.TryInsert(Vt) is null) { changed = true; }
                            }
                        }
                    }
                    {
                        var checkCount = count - checkIndex;
                        // if everything after target can be empty,
                        // then FOLLOW( target ) includes FOLLOW( regulation.left ).
                        if (CanBeEmpty(right, checkIndex, checkCount, emptyDict)) {
                            if (!result.TryGetValue(target, out var follow)) { throw new Exception(Consts.algorithmError); }
                            if (!result.TryGetValue(regulationDraft.left, out var refFollow)) { throw new Exception(Consts.algorithmError); }
                            if (follow != refFollow) { // skip when target == left: copying a set into itself
                                foreach (var Vt in refFollow.Values) {
                                    if (follow.TryInsert(Vt) is null) { changed = true; }
                                }
                            }
                        }
                    }
                }
            }
        }
    } while (changed);
    return result;
}
从产生式到LL(1)语法分析表
用LL(1)分析法得到分析表
用LL(1)分析法得到分析表
// get LL1ParseTableDraft via LL(1) parse.
// For each regulation 'Vn : right':
//   - for every Vt in FIRST(right): table[Vn, Vt] = this regulation;
//   - if FIRST(right) contains empty: also for every Vt in FOLLOW(Vn).
static LL1ParseTableDraft GetLL1SyntaxInfo(
    IReadOnlyList<RegulationDraft> regulations, // the productions
    Dictionary<VNode, FOLLOW> eFOLLOWDict, Dictionary<VNodesKey, FIRST> eFIRSTDict) {
    var regCount = regulations.Count;
    var table = new LL1ParseTableDraft();
    for (int regulationId = 0; regulationId < regCount; regulationId++) {
        var regulation = regulations[regulationId];
        var Vn = regulation.left;
        var key = new VNodesKey(regulation.right);
        var first = eFIRSTDict[key];
        var firstCount = first.Values.Count;
        // FIX: removed two unused locals (a truncated key.ToString() debug string).
        for (int index = 0; index < firstCount; index++) {
            var Vt = first.Values[index];
            table.SetAction(Vn, Vt, new LL1ParseActionDraft(regulationId));
        }
        if (first.containsEmpty) {
            // the right-hand side may derive empty: predict this regulation
            // on every Vt that may follow Vn.
            var follow = eFOLLOWDict[Vn];
            foreach (var Vt in follow.Values) {
                table.SetAction(Vn, Vt, new LL1ParseActionDraft(regulationId));
            }
        }
    }
    return table;
}
从产生式到LR(1)语法分析表
从产生式到LR(1)语法分析表
从产生式到LR(1)语法分析表
static CoupleList<VNode>? eEndCoupleList; // NOTE(review): not referenced anywhere in the code shown here — possibly a leftover cache for the '¥' lookahead list; confirm before removing.
// Compute the LR automaton's Edge list, State list and the LRTableDraft,
// via the canonical worklist construction: start from the closure of the
// augmented item { S' : ⏳ S, '¥' }, then repeatedly GoTo + Closure over
// every symbol that appears next to a dot.
static LRSyntaxInfo GetLRSyntaxInfo(
    IReadOnlyList<RegulationDraft> regulations,
    LRParseContext context) {
    var beta2FIRST = new Dictionary<VNodesKey, FIRST>(); // cache: betaZ -> FIRST(betaZ)
    var nodeRegulationsDict = new Dictionary<VNode, IReadOnlyList<RegulationDraft>>(); // cache: Vn -> its regulations
    var stateList = new CoupleList<LRState>(context.stateComparer);
    var edgeList = new CoupleList<LREdge>(context.edgeComparer);
    var eEnd = VNode.endOfTokenList;
    var queue = new Queue<LRState>();
    {
        // state 0: closure of the augmented start item with lookahead '¥'.
        var firstState = new LRState(index: 0);
        firstState.TryExpand(context, out var _, context.eRegulations[0], 0, new CoupleList<VNode>(eEnd));
        Algo.Closure(firstState, context, beta2FIRST, nodeRegulationsDict);
        stateList.TryInsert(firstState);
        queue.Enqueue(firstState);
    }
    while (queue.Count > 0) {
        var subject = queue.Dequeue();
        var Vs = new List<VNode>(); // symbols already transitioned from this state
        foreach (var item in subject.ItemGroups) {
            VNode? V = item.nodeNext2Dot;
            if (V is not null && (!Vs.Contains(V))) {
                Vs.Add(V);
                var to = Algo.GoTo(subject, V, context);
                Algo.Closure(to, context, beta2FIRST, nodeRegulationsDict);
                var against = stateList.TryInsert(to); // merge into the list if an equal state exists
                LRState? whichTo = null;
                if (against is null) {
                    // 'to' is a brand-new state: number it and schedule it for expansion.
                    to.index = stateList.Count - 1; queue.Enqueue(to);
                    whichTo = to;
                }
                else {
                    whichTo = against;
                }
                // record the transition; duplicates are ignored by TryInsert.
                // FIX: dropped the unused local that captured TryInsert's result.
                var edge = new LREdge(subject, V, whichTo);
                edgeList.TryInsert(edge);
            }
        }
    }
    LRTableDraft table = GetLRTableDraft(stateList, edgeList, context.vtsProvider, context.eRegulations, eEnd);
    var result = new LRSyntaxInfo(stateList, edgeList, table);
    return result;
}
// The LR Closure operation: complete a state.
// Every item whose dot stands before a nonterminal (Vn) is expanded into that
// Vn's regulations, with lookaheads FIRST(betaZ) — the symbols after the Vn
// combined with the item's own lookaheads.
private static void Closure(this LRState subject, LRParseContext context,
    Dictionary<VNodesKey, FIRST> beta2FIRST, // cache for faster closure
    Dictionary<VNode, IReadOnlyList<RegulationDraft>> nodeRegulationsDict/* cache for faster closure*/) {
    var queue = new Queue<LRItemGroup>();
    // seed the worklist with every item whose next-to-dot node is a Vn.
    foreach (var itemGroup in subject.ItemGroups) {
        VNode? node = itemGroup.nodeNext2Dot;
        if (node is not null && node.category == VNode.Category.Vn) {
            queue.Enqueue(itemGroup);
        }
    }
    while (queue.Count > 0) {
        var itemGroup = queue.Dequeue();
        VNode? node = itemGroup.nodeNext2Dot; // non-null Vn: only such items are enqueued
        // cache the regulations whose left side is 'node'.
        if (!nodeRegulationsDict.TryGetValue(node, out var regulationDrafts)) {
            regulationDrafts = context.eRegulations.GetRegulationDrafts(left: node);
            nodeRegulationsDict.Add(node, regulationDrafts);
        }
        // cache FIRST(betaZ) — the lookahead set for the expanded items.
        VNodesKey betaZ = itemGroup.betaZ;
        if (!beta2FIRST.TryGetValue(betaZ, out var first)) {
            first = GetFIRST(betaZ, context.eFIRSTDict, context.eNullableDict);
            beta2FIRST.Add(betaZ, first);
        }
        IReadOnlyList<VNode>? lookAheads = first.Values;
        foreach (var regulationDraft in regulationDrafts) {
            const int dotPosition = 0; // new items start with the dot at the far left
            var position = subject.TryExpand(context, regulationDraft, dotPosition, lookAheads);
            VNode? node2 = position.nodeNext2Dot;
            // a freshly added/merged item may itself need expansion.
            if (node2 is not null && node2.category == VNode.Category.Vn) {
                queue.Enqueue(position);
            }
        }
    }
}
/// <summary>
/// The LR Goto operation:
/// move the dot (⏳) past the symbol <paramref name="V"/> in every LR item
/// that has <paramref name="V"/> next to the dot.
/// </summary>
/// <param name="subject">the source state.</param>
/// <param name="V">a grammar symbol, terminal or nonterminal.</param>
/// <returns>the (not yet closed) target state.</returns>
/// NOTE(review): the call site above invokes Algo.GoTo(subject, V, context) with three
/// arguments and calls TryExpand with a leading context parameter — this two-argument
/// overload looks like an excerpt inconsistency; confirm against the full source.
private static LRState GoTo(this LRState subject, VNode V) {
    var toState = new LRState();
    foreach (var itemGroup in subject.ItemGroups) {
        VNode? next = itemGroup.nodeNext2Dot;
        if (next is not null && next == V) {
            // copy the item with the dot advanced by one position; lookaheads unchanged.
            toState.TryExpand(itemGroup.regulationDraft, itemGroup.dotPosition + 1, itemGroup.lookAheads);
        }
    }
    return toState;
}
// Organize stateList and edgeList into a parse table draft:
// edges become goto/shift-in actions; completed items become accept/reduce actions.
private static LRTableDraft GetLRTableDraft(CoupleList<LRState> stateList, CoupleList<LREdge> edgeList,
    ILRTableVtsProvider VtsProvider, IReadOnlyList<RegulationDraft> eRegulations, VNode eEnd) {
    var draft = new LRTableDraft(stateList.Count);
    // every automaton edge is either a goto (on a Vn) or a shift-in (on a Vt).
    foreach (var edge in edgeList) {
        var fromId = edge.from.index;
        var targetId = edge.to.index;
        var action = edge.V.category switch {
            VNode.Category.Vn => new LRParseActionDraft(LRParseActionDraft.EType.Goto, targetId),
            VNode.Category.Vt => new LRParseActionDraft(LRParseActionDraft.EType.ShiftIn, targetId),
            _ => throw new NotImplementedException(),
        };
        draft.SetAction(fromId, edge.V, action);
    }
    var eLeft = eRegulations[0].left; // the S' in many books.
    foreach (var state in stateList) {
        foreach (var itemGroup in state.ItemGroups) {
            if (itemGroup.nodeNext2Dot is not null) { continue; } // dot not at the end: nothing to reduce
            if (itemGroup.regulationDraft.left == eLeft) {
                // the augmented start item is completed: accept on end-of-tokens.
                draft.SetAction(state.index, eEnd, LRParseActionDraft.accept);
            }
            else {
                // a normal item is completed: reduce by its regulation on every lookahead Vt.
                var regulationId = Algo.IndexOf(eRegulations, itemGroup.regulationDraft);
                var reduce = new LRParseActionDraft(LRParseActionDraft.EType.Reduction, regulationId);
                draft.SetAction(state.index, VtsProvider.GetVts(itemGroup), reduce);
            }
        }
    }
    return draft;
}
从LR(1)语法分析表到LALR(1)语法分析表
既然已经有了LR(1)语法分析表,那么再计算LALR(1)语法分析表就有了简便方法。原因是:LALR(1)分析法与LR(1)分析法的区别仅在于:两个LR(1)的State里,某个项中的lookAhead不同时,那就意味着这是不同的State;而两个LALR(1)的State里,各个项中的lookAhead不同时,仍旧表示同一个State。也就是说,只需将LR(1)的各个State中,对应项相同而lookAhead不同的State合并,就得到了LALR(1)分析表的State。
LALR(1)简便算法
// Quickly derive the LALR(1) Edge/State/table from the LR(1) syntax info:
// LALR(1) states are LR(1) states merged whenever their item cores
// (regulation + dot position) are equal, ignoring lookaheads.
// FIX: this method used an undefined 'context' — it is now an explicit parameter.
static LRSyntaxInfo GetLALR1SyntaxInfo(LRSyntaxInfo _LRSyntaxInfo, // LR(1) syntax info
    IReadOnlyList<RegulationDraft> eRegulations, LRParseContext context) {
    var stateList = new CoupleList<LRState>();
    // a LALR(1) state is 1/more LR(1) states.
    // so, let's merge LR(1) states into LALR(1) state.
    var LRState2LALR1State = new Dictionary<LRState, LRState>();
    foreach (var state in _LRSyntaxInfo.stateList) {
        var mentor = Absorb(stateList, state, context);
        // 'state'(LR(1)) is merged into 'mentor'(LALR(1))
        LRState2LALR1State.Add(state, mentor);
    }
    // re-target every LR(1) edge at the merged states; duplicates collapse via TryInsert.
    var edgeList = new CoupleList<LREdge>(_LRSyntaxInfo.edgeList.Count, context.edgeComparer);
    foreach (var edge in _LRSyntaxInfo.edgeList) {
        var from = LRState2LALR1State[edge.from];
        var to = LRState2LALR1State[edge.to];
        var edge2 = new LREdge(from, edge.V, to);
        edgeList.TryInsert(edge2);
    }
    var eEnd = VNode.endOfTokenList;
    LRTableDraft table = GetLRTableDraft(stateList, edgeList, context.vtsProvider, eRegulations, eEnd);
    var result = new LRSyntaxInfo(stateList, edgeList, table);
    return result;
}
// either some state in LALR1StateList absorbs/merges LR1State's lookAheads,
// or a new LALR(1) state which consists of LR1State is generated.
// orderList is kept sorted by the state comparer, so the match is binary-searched;
// TryAbsorb both compares and (on equality) merges the lookaheads in place.
static LRState Absorb(CoupleList<LRState> LALR1StateList, LRState LR1State, LRParseContext context) {
    var addList = LALR1StateList.addList; var orderList = LALR1StateList.orderList;
    LRState? mentor = null; var index = orderList.Count; // index assigned to a newly created LALR(1) state
    int left = 0, right = orderList.Count - 1;
    if (right < 0) {
        // empty list: the first LALR(1) state is a copy of LR1State's items.
        mentor = new LRState(index);
        mentor.index = index; // NOTE(review): looks redundant with the constructor argument — confirm.
        foreach (var itemGroup in LR1State.ItemGroups) {
            mentor.TryExpand(itemGroup.regulationDraft, itemGroup.dotPosition, itemGroup.lookAheads);
        }
        addList.Add(mentor); orderList.Add(mentor);
    }
    else {
        while (left < right) {
            int mid = (left + right) / 2;
            var current = orderList[mid];
            var result = TryAbsorb(current, LR1State, context);
            if (result < 0) { right = mid; }
            else if (result == 0) { left = mid; right = mid; mentor = current; }
            else { left = mid + 1; }
        }
        if (mentor is null) {
            // the search narrowed to a single candidate that was not yet compared equal.
            var LALR1State = orderList[left];
            var result = TryAbsorb(LALR1State, LR1State, context);
            if (result == 0) { /* already inserted into 'current' */ mentor = LALR1State; }
            else {
                // no equal state exists: create a new LALR(1) state at the sorted position.
                mentor = new LRState(index);
                foreach (var itemGroup in LR1State.ItemGroups) {
                    mentor.TryExpand(itemGroup.regulationDraft, itemGroup.dotPosition, itemGroup.lookAheads);
                }
                addList.Add(mentor); orderList.Insert(result < 0 ? left : left + 1, mentor);
            }
        }
    }
    return mentor;
}
// If LALR1State and LR1State hold the same item cores (per the state comparer),
// merge every lookahead of LR1State into LALR1State.
// <para>only used in LALR(1) syntax parse.</para>
// Returns the comparer result; 0 means equal states, which were absorbed.
static int TryAbsorb(LRState LALR1State, LRState LR1State, LRParseContext context) {
    int comparison = context.stateComparer(LALR1State, LR1State);
    if (comparison != 0) { return comparison; }
    // equal states should be absorbed: merge each item group's lookaheads.
    foreach (var itemGroup in LR1State.ItemGroups) {
        DoAbsorb(LALR1State.OrderList, itemGroup, context);
    }
    return comparison;
}
// Merge itemGroup's lookaheads into the item of 'list' that has the same core
// (regulation + dot position). 'list' is kept sorted, so a binary search finds it.
static void DoAbsorb(IReadOnlyList<LRItemGroup> list, LRItemGroup itemGroup, LRParseContext context) {
    LRItemGroup? against = null; // FIX: dropped the unused 'keyIndex' local
    var left = 0; var right = list.Count - 1;
    if (right >= 0) {
        var result = -1;
        while (left < right) {
            var mid = (left + right) / 2;
            var current = list[mid];
            result = context.itemGroupAbsorber(current, itemGroup.regulationDraft, itemGroup.dotPosition);
            if (result < 0) { right = mid; }
            else if (result == 0) { left = mid; right = mid; against = current; }
            else { left = mid + 1; }
        }
        if (result != 0) {
            var current = list[left];
            result = context.itemGroupAbsorber(current, itemGroup.regulationDraft, itemGroup.dotPosition);
            if (result == 0) { against = current; }
        }
    }
    // FIX: 'against' was dereferenced unconditionally — an empty list or a missing
    // match crashed with a NullReferenceException instead of a meaningful error.
    // (the caller only invokes this for states compared equal, so a match is expected.)
    if (against is null) { throw new Exception(Consts.algorithmError); }
    foreach (var lookAhead in itemGroup.lookAheads) {
        against.lookAheads.TryInsert(lookAhead);
    }
}
从LALR(1)语法分析表到语法分析器的C#代码
LALR(1)语法分析表记录的是在某个LALR(1)状态下,遇到某个V(Vn或Vt)时,应当跳转到哪个状态。在跳转过程中,有时应当用某个产生式规约,有时应当移进Vt,如果遇到'¥',则应当结束。
对应的语法分析器C#源代码,应当将LALR(1)语法分析表记录下来,这实际上是若干字典对象Dictionary<string/*V*/, LRParseAction>。
用LALR(1)分析表建立的语法分析器
// Generated LALR(1) parser table for the Calc grammar: 16 states, each holding a
// dictionary from the incoming symbol (EType) to a parse action
// (Goto / ShiftIn / Reduction / accept).
partial class CompilerCalc {
    const int syntaxStateCount = 16;
    /// <summary>
    /// LALR(1) syntax parsing table
    /// </summary>
    private static readonly SyntaxState[] syntaxStates = new SyntaxState[syntaxStateCount];
    // Fill the table. Actions reused by several states are allocated once and shared.
    private static void InitializeSyntaxStates() {
        var list = CompilerCalc.syntaxStates;
        // allocate all states first, so actions below can reference any of them.
        for (int i = 0; i < syntaxStateCount; i++) {
            list[i] = new SyntaxState();
        }
        var aGoto2 = new LRParseAction(LRParseAction.EType.Goto, syntaxStates[2]);// reused 2 times
        var aGoto3 = new LRParseAction(LRParseAction.EType.Goto, syntaxStates[3]);// reused 4 times
        var aShiftIn4 = new LRParseAction(LRParseAction.EType.ShiftIn, syntaxStates[4]);// reused 6 times
        var aShiftIn5 = new LRParseAction(LRParseAction.EType.ShiftIn, syntaxStates[5]);// reused 6 times
        var aShiftIn6 = new LRParseAction(LRParseAction.EType.ShiftIn, syntaxStates[6]);// reused 2 times
        var aShiftIn7 = new LRParseAction(LRParseAction.EType.ShiftIn, syntaxStates[7]);// reused 2 times
        var aShiftIn8 = new LRParseAction(LRParseAction.EType.ShiftIn, syntaxStates[8]);// reused 3 times
        var aShiftIn9 = new LRParseAction(LRParseAction.EType.ShiftIn, syntaxStates[9]);// reused 3 times
        var aReduction2 = new LRParseAction(LRParseAction.EType.Reduction, regulations[2]);// reused 4 times
        var aReduction5 = new LRParseAction(LRParseAction.EType.Reduction, regulations[5]);// reused 6 times
        var aReduction7 = new LRParseAction(LRParseAction.EType.Reduction, regulations[7]);// reused 6 times
        var aReduction0 = new LRParseAction(LRParseAction.EType.Reduction, regulations[0]);// reused 4 times
        var aReduction1 = new LRParseAction(LRParseAction.EType.Reduction, regulations[1]);// reused 4 times
        var aReduction3 = new LRParseAction(LRParseAction.EType.Reduction, regulations[3]);// reused 6 times
        var aReduction4 = new LRParseAction(LRParseAction.EType.Reduction, regulations[4]);// reused 6 times
        // 78 actions. 0 conflicts.
        // syntaxStates[0]:
        // [-1] FinalValue : ⏳ Additive ; '¥'
        // [0] Additive : ⏳ Additive '+' Multiplicative ; '¥' '+' '-'
        // [1] Additive : ⏳ Additive '-' Multiplicative ; '¥' '+' '-'
        // [2] Additive : ⏳ Multiplicative ; '¥' '+' '-'
        // [3] Multiplicative : ⏳ Multiplicative '*' Primary ; '¥' '+' '-' '*' '/'
        // [4] Multiplicative : ⏳ Multiplicative '/' Primary ; '¥' '+' '-' '*' '/'
        // [5] Multiplicative : ⏳ Primary ; '¥' '+' '-' '*' '/'
        // [6] Primary : ⏳ '(' Additive ')' ; '¥' '+' '-' '*' '/'
        // [7] Primary : ⏳ 'number' ; '¥' '+' '-' '*' '/'
        list[0].actionDict.Add(EType.@Additive, new LRParseAction(LRParseAction.EType.Goto, syntaxStates[1]));/*Actions[0]*/
        list[0].actionDict.Add(EType.@Multiplicative, aGoto2);/*Actions[1]*/
        list[0].actionDict.Add(EType.@Primary, aGoto3);/*Actions[2]*/
        list[0].actionDict.Add(EType.@LeftParenthesis符, aShiftIn4);/*Actions[3]*/
        list[0].actionDict.Add(EType.@number, aShiftIn5);/*Actions[4]*/
        // syntaxStates[1]:
        // [-1] FinalValue : Additive ⏳ ; '¥'
        // [0] Additive : Additive ⏳ '+' Multiplicative ; '¥' '+' '-'
        // [1] Additive : Additive ⏳ '-' Multiplicative ; '¥' '+' '-'
        list[1].actionDict.Add(EType.@Plus符, aShiftIn6);/*Actions[5]*/
        list[1].actionDict.Add(EType.@Dash符, aShiftIn7);/*Actions[6]*/
        list[1].actionDict.Add(EType.@终, LRParseAction.accept);/*Actions[7]*/
        // syntaxStates[2]:
        // [2] Additive : Multiplicative ⏳ ; '¥' '+' '-' ')'
        // [3] Multiplicative : Multiplicative ⏳ '*' Primary ; '¥' '+' '-' '*' '/' ')'
        // [4] Multiplicative : Multiplicative ⏳ '/' Primary ; '¥' '+' '-' '*' '/' ')'
        list[2].actionDict.Add(EType.@Asterisk符, aShiftIn8);/*Actions[8]*/
        list[2].actionDict.Add(EType.@Slash符, aShiftIn9);/*Actions[9]*/
        list[2].actionDict.Add(EType.@终, aReduction2);/*Actions[10]*/
        list[2].actionDict.Add(EType.@Plus符, aReduction2);/*Actions[11]*/
        list[2].actionDict.Add(EType.@Dash符, aReduction2);/*Actions[12]*/
        list[2].actionDict.Add(EType.@RightParenthesis符, aReduction2);/*Actions[13]*/
        // syntaxStates[3]:
        // [5] Multiplicative : Primary ⏳ ; '¥' '+' '-' '*' '/' ')'
        list[3].actionDict.Add(EType.@终, aReduction5);/*Actions[14]*/
        list[3].actionDict.Add(EType.@Plus符, aReduction5);/*Actions[15]*/
        list[3].actionDict.Add(EType.@Dash符, aReduction5);/*Actions[16]*/
        list[3].actionDict.Add(EType.@Asterisk符, aReduction5);/*Actions[17]*/
        list[3].actionDict.Add(EType.@Slash符, aReduction5);/*Actions[18]*/
        list[3].actionDict.Add(EType.@RightParenthesis符, aReduction5);/*Actions[19]*/
        // syntaxStates[4]:
        // [6] Primary : '(' ⏳ Additive ')' ; '¥' '+' '-' '*' '/' ')'
        // [0] Additive : ⏳ Additive '+' Multiplicative ; ')' '+' '-'
        // [1] Additive : ⏳ Additive '-' Multiplicative ; ')' '+' '-'
        // [2] Additive : ⏳ Multiplicative ; ')' '+' '-'
        // [3] Multiplicative : ⏳ Multiplicative '*' Primary ; ')' '+' '-' '*' '/'
        // [4] Multiplicative : ⏳ Multiplicative '/' Primary ; ')' '+' '-' '*' '/'
        // [5] Multiplicative : ⏳ Primary ; ')' '+' '-' '*' '/'
        // [6] Primary : ⏳ '(' Additive ')' ; ')' '+' '-' '*' '/'
        // [7] Primary : ⏳ 'number' ; ')' '+' '-' '*' '/'
        list[4].actionDict.Add(EType.@Additive, new LRParseAction(LRParseAction.EType.Goto, syntaxStates[10]));/*Actions[20]*/
        list[4].actionDict.Add(EType.@Multiplicative, aGoto2);/*Actions[21]*/
        list[4].actionDict.Add(EType.@Primary, aGoto3);/*Actions[22]*/
        list[4].actionDict.Add(EType.@LeftParenthesis符, aShiftIn4);/*Actions[23]*/
        list[4].actionDict.Add(EType.@number, aShiftIn5);/*Actions[24]*/
        // syntaxStates[5]:
        // [7] Primary : 'number' ⏳ ; '¥' '+' '-' '*' '/' ')'
        list[5].actionDict.Add(EType.@终, aReduction7);/*Actions[25]*/
        list[5].actionDict.Add(EType.@Plus符, aReduction7);/*Actions[26]*/
        list[5].actionDict.Add(EType.@Dash符, aReduction7);/*Actions[27]*/
        list[5].actionDict.Add(EType.@Asterisk符, aReduction7);/*Actions[28]*/
        list[5].actionDict.Add(EType.@Slash符, aReduction7);/*Actions[29]*/
        list[5].actionDict.Add(EType.@RightParenthesis符, aReduction7);/*Actions[30]*/
        // syntaxStates[6]:
        // [0] Additive : Additive '+' ⏳ Multiplicative ; '¥' '+' '-' ')'
        // [3] Multiplicative : ⏳ Multiplicative '*' Primary ; '¥' '+' '-' '*' '/' ')'
        // [4] Multiplicative : ⏳ Multiplicative '/' Primary ; '¥' '+' '-' '*' '/' ')'
        // [5] Multiplicative : ⏳ Primary ; '¥' '+' '-' '*' '/' ')'
        // [6] Primary : ⏳ '(' Additive ')' ; '¥' '+' '-' '*' '/' ')'
        // [7] Primary : ⏳ 'number' ; '¥' '+' '-' '*' '/' ')'
        list[6].actionDict.Add(EType.@Multiplicative, new LRParseAction(LRParseAction.EType.Goto, syntaxStates[11]));/*Actions[31]*/
        list[6].actionDict.Add(EType.@Primary, aGoto3);/*Actions[32]*/
        list[6].actionDict.Add(EType.@LeftParenthesis符, aShiftIn4);/*Actions[33]*/
        list[6].actionDict.Add(EType.@number, aShiftIn5);/*Actions[34]*/
        // syntaxStates[7]:
        // [1] Additive : Additive '-' ⏳ Multiplicative ; '¥' '+' '-' ')'
        // [3] Multiplicative : ⏳ Multiplicative '*' Primary ; '¥' '+' '-' '*' '/' ')'
        // [4] Multiplicative : ⏳ Multiplicative '/' Primary ; '¥' '+' '-' '*' '/' ')'
        // [5] Multiplicative : ⏳ Primary ; '¥' '+' '-' '*' '/' ')'
        // [6] Primary : ⏳ '(' Additive ')' ; '¥' '+' '-' '*' '/' ')'
        // [7] Primary : ⏳ 'number' ; '¥' '+' '-' '*' '/' ')'
        list[7].actionDict.Add(EType.@Multiplicative, new LRParseAction(LRParseAction.EType.Goto, syntaxStates[12]));/*Actions[35]*/
        list[7].actionDict.Add(EType.@Primary, aGoto3);/*Actions[36]*/
        list[7].actionDict.Add(EType.@LeftParenthesis符, aShiftIn4);/*Actions[37]*/
        list[7].actionDict.Add(EType.@number, aShiftIn5);/*Actions[38]*/
        // syntaxStates[8]:
        // [3] Multiplicative : Multiplicative '*' ⏳ Primary ; '¥' '+' '-' '*' '/' ')'
        // [6] Primary : ⏳ '(' Additive ')' ; '¥' '+' '-' '*' '/' ')'
        // [7] Primary : ⏳ 'number' ; '¥' '+' '-' '*' '/' ')'
        list[8].actionDict.Add(EType.@Primary, new LRParseAction(LRParseAction.EType.Goto, syntaxStates[13]));/*Actions[39]*/
        list[8].actionDict.Add(EType.@LeftParenthesis符, aShiftIn4);/*Actions[40]*/
        list[8].actionDict.Add(EType.@number, aShiftIn5);/*Actions[41]*/
        // syntaxStates[9]:
        // [4] Multiplicative : Multiplicative '/' ⏳ Primary ; '¥' '+' '-' '*' '/' ')'
        // [6] Primary : ⏳ '(' Additive ')' ; '¥' '+' '-' '*' '/' ')'
        // [7] Primary : ⏳ 'number' ; '¥' '+' '-' '*' '/' ')'
        list[9].actionDict.Add(EType.@Primary, new LRParseAction(LRParseAction.EType.Goto, syntaxStates[14]));/*Actions[42]*/
        list[9].actionDict.Add(EType.@LeftParenthesis符, aShiftIn4);/*Actions[43]*/
        list[9].actionDict.Add(EType.@number, aShiftIn5);/*Actions[44]*/
        // syntaxStates[10]:
        // [6] Primary : '(' Additive ⏳ ')' ; '¥' '+' '-' '*' '/' ')'
        // [0] Additive : Additive ⏳ '+' Multiplicative ; ')' '+' '-'
        // [1] Additive : Additive ⏳ '-' Multiplicative ; ')' '+' '-'
        list[10].actionDict.Add(EType.@RightParenthesis符, new LRParseAction(LRParseAction.EType.ShiftIn, syntaxStates[15]));/*Actions[45]*/
        list[10].actionDict.Add(EType.@Plus符, aShiftIn6);/*Actions[46]*/
        list[10].actionDict.Add(EType.@Dash符, aShiftIn7);/*Actions[47]*/
        // syntaxStates[11]:
        // [0] Additive : Additive '+' Multiplicative ⏳ ; '¥' '+' '-' ')'
        // [3] Multiplicative : Multiplicative ⏳ '*' Primary ; '¥' '+' '-' '*' '/' ')'
        // [4] Multiplicative : Multiplicative ⏳ '/' Primary ; '¥' '+' '-' '*' '/' ')'
        list[11].actionDict.Add(EType.@Asterisk符, aShiftIn8);/*Actions[48]*/
        list[11].actionDict.Add(EType.@Slash符, aShiftIn9);/*Actions[49]*/
        list[11].actionDict.Add(EType.@终, aReduction0);/*Actions[50]*/
        list[11].actionDict.Add(EType.@Plus符, aReduction0);/*Actions[51]*/
        list[11].actionDict.Add(EType.@Dash符, aReduction0);/*Actions[52]*/
        list[11].actionDict.Add(EType.@RightParenthesis符, aReduction0);/*Actions[53]*/
        // syntaxStates[12]:
        // [1] Additive : Additive '-' Multiplicative ⏳ ; '¥' '+' '-' ')'
        // [3] Multiplicative : Multiplicative ⏳ '*' Primary ; '¥' '+' '-' '*' '/' ')'
        // [4] Multiplicative : Multiplicative ⏳ '/' Primary ; '¥' '+' '-' '*' '/' ')'
        list[12].actionDict.Add(EType.@Asterisk符, aShiftIn8);/*Actions[54]*/
        list[12].actionDict.Add(EType.@Slash符, aShiftIn9);/*Actions[55]*/
        list[12].actionDict.Add(EType.@终, aReduction1);/*Actions[56]*/
        list[12].actionDict.Add(EType.@Plus符, aReduction1);/*Actions[57]*/
        list[12].actionDict.Add(EType.@Dash符, aReduction1);/*Actions[58]*/
        list[12].actionDict.Add(EType.@RightParenthesis符, aReduction1);/*Actions[59]*/
        // syntaxStates[13]:
        // [3] Multiplicative : Multiplicative '*' Primary ⏳ ; '¥' '+' '-' '*' '/' ')'
        list[13].actionDict.Add(EType.@终, aReduction3);/*Actions[60]*/
        list[13].actionDict.Add(EType.@Plus符, aReduction3);/*Actions[61]*/
        list[13].actionDict.Add(EType.@Dash符, aReduction3);/*Actions[62]*/
        list[13].actionDict.Add(EType.@Asterisk符, aReduction3);/*Actions[63]*/
        list[13].actionDict.Add(EType.@Slash符, aReduction3);/*Actions[64]*/
        list[13].actionDict.Add(EType.@RightParenthesis符, aReduction3);/*Actions[65]*/
        // syntaxStates[14]:
        // [4] Multiplicative : Multiplicative '/' Primary ⏳ ; '¥' '+' '-' '*' '/' ')'
        list[14].actionDict.Add(EType.@终, aReduction4);/*Actions[66]*/
        list[14].actionDict.Add(EType.@Plus符, aReduction4);/*Actions[67]*/
        list[14].actionDict.Add(EType.@Dash符, aReduction4);/*Actions[68]*/
        list[14].actionDict.Add(EType.@Asterisk符, aReduction4);/*Actions[69]*/
        list[14].actionDict.Add(EType.@Slash符, aReduction4);/*Actions[70]*/
        list[14].actionDict.Add(EType.@RightParenthesis符, aReduction4);/*Actions[71]*/
        // syntaxStates[15]:
        // [6] Primary : '(' Additive ')' ⏳ ; '¥' '+' '-' '*' '/' ')'
        list[15].actionDict.Add(EType.@终, aReduction6);/*Actions[72]*/
        list[15].actionDict.Add(EType.@Plus符, aReduction6);/*Actions[73]*/
        list[15].actionDict.Add(EType.@Dash符, aReduction6);/*Actions[74]*/
        list[15].actionDict.Add(EType.@Asterisk符, aReduction6);/*Actions[75]*/
        list[15].actionDict.Add(EType.@Slash符, aReduction6);/*Actions[76]*/
        list[15].actionDict.Add(EType.@RightParenthesis符, aReduction6);/*Actions[77]*/
    }
}
编译原理中的词法分析器
根据文法Grammar生成词法分析器,其基本原理是:用正则表达式描述解析器中的单词Token,然后将其转换为词法分析器代码。
从正则表达式到ε-NFA
正则表达式是若干个“字符及其重复次数”的list。因此它也可以用一个文法表示。
Pattern.st
// Pattern is xxx in %%xxx%%
// xxx is any char between □(32) and ~(126).
#extractor <Pattern.st.ext>
syntax {
_Pattern : PreRegex Regex PostRegex ;
PreRegex : 'refVt' | empty ;
PostRegex : '/' Regex | empty ;
Regex : Regex '|' Bunch | Bunch ;
Bunch : Bunch Unit | Unit ;
Unit : 'char' Repeat | '.' Repeat | 'scope' Repeat | '(' Regex ')' Repeat ;
Repeat : '?' | '+' | '*' | 'MinMax' | empty ;
}
lexi {
'refVt' : %%refVt.%% ;
'char' : %%char.%% ;
'scope' : %%scope.%% ;
// {2} {2, } {2, 5}
'MinMax' : %%minmax.%% ;
}
mermaid {
0[[Pattern]] -->|"("| end11((#40;))
0 -->|")"| end12((#41;))
0 -->|"#42;"| end13((#42;))
...
0 -->|"other"| end6((char))
}
options {
'GrammarName' : Pattern ;
'ExtractedType' : Pattern ;
'blockComment' : off ;
'inlineComment' : off ;
}
当处理一个char及其次数时:
当处理一个char及其次数时
// Build an ε-NFA fragment for a single char repeated per 'minmax' (e.g. a{2,5}):
// a chain of states linked by 'value' edges, ε edges that allow stopping after at
// least minmax.min repeats, and a self-loop when the maximum is unbounded.
eNFAFragment Parse(char value, MinMax minmax) {
    ICharRange conditionCode = SingleChar.New(value);
    var count = minmax.max + 1; // a chain of count states encodes up to minmax.max repeats
    if (count <= 0) { count = minmax.min + 1; } // unbounded max (< 0): chain only up to minmax.min
    var stateList = new eNFAState[count]; stateList[0] = new eNFAState();
    for (int i = 1; i < count; i++) {
        stateList[i] = new eNFAState();
        // FIX: Connect's return value was captured into unused locals throughout;
        // the edge registers itself on the states, so the result is not needed.
        eNFAEdge.Connect(stateList[i - 1], stateList[i], conditionCode);
    }
    // ε edges: once minmax.min repeats are consumed, allow jumping straight to the end.
    for (int i = minmax.min; i < count - 1; i++) {
        eNFAEdge.Connect(stateList[i], stateList[count - 1], null/*ε edge*/);
    }
    if (minmax.max < 0) { // unbounded maximum: loop on the last state
        eNFAEdge.Connect(stateList[count - 1], stateList[count - 1], conditionCode);
    }
    var unit = new eNFAFragment(start: stateList[0], end: stateList[count - 1]);
    return unit;
}
当处理其他类型的字符集合时,与此同理,不再赘述。
完整的词法分析器,需要处理Grammar中的所有Token类型,因此,我们必须把各个正则表达式拼合为一个大的正则表达式,如下:
拼合所有正则表达式
/// <summary>
/// get <see cref="AutomatonInfo"/> of the whole complete ε-NFA for lexical analyze
/// </summary>
/// <returns>the ε-NFA whose shared start/end states wrap every token's regex automaton.</returns>
public eNFAInfo GetWholeAutomaton() {
    // fix: this initialization was commented out while pattern2Vt was still used below.
    var pattern2Vt = this.GetPattern2Vt();
    // prepare dict for faster search.
    // <'Vt'> -> its tokenScript's copied automaton (so refVt patterns can chain onto it)
    var regexInfoDict = new Dictionary<VNode/*Vt*/, eNFAInfo>();
    var wholeStart = new eNFAState(0, "wholeStart");
    var wholeEnd = new eNFAState(1, "wholeEnd"); wholeEnd.isEnd = true;
    var wholeeNFA = new eNFAInfo(wholeStart, wholeEnd);
    // connect all eNFAInfo together to make a whole complete ε-NFA for lexical analyzing.
    int VtId = 1;
    var queue = new Queue<Pattern>(); foreach (var item in pattern2Vt) { queue.Enqueue(item.Key); }
    var queue2 = new Queue<Pattern>(); // patterns whose preVt automaton is not built yet
    while (queue.Count > 0) {
        int pendingBefore = queue.Count; // used to detect a pass that makes no progress
        while (queue.Count > 0) {
            var pattern = queue.Dequeue();
            if (pattern.preVt == CompilerPattern.defaultpreVt) {
                var Vt = pattern2Vt[pattern];
                var regexInfo = pattern.regexInfo.Copy(VtId);
                // connect preENFA & regex: an ordinary token hangs off the global start.
                var closeStart = eNFAEdge.Connect(wholeStart, regexInfo.start);
                var closeEnd = eNFAEdge.Connect(regexInfo.end, wholeEnd);
                regexInfoDict.Add(Vt, regexInfo);
                VtId++;
            }
            else if (regexInfoDict.TryGetValue(pattern.preVt, out var preENFA)) {
                var Vt = pattern2Vt[pattern];
                var regexInfo = pattern.regexInfo.Copy(VtId);
                // connect preENFA and tokenScript: a refVt pattern continues its prefix token.
                var closeStart = eNFAEdge.Connect(preENFA.end, regexInfo.start);
                var closeEnd = eNFAEdge.Connect(regexInfo.end, wholeEnd);
                regexInfoDict.Add(Vt, regexInfo);
                VtId++;
            }
            else {
                // preVt not processed yet: retry in the next pass.
                queue2.Enqueue(pattern);
            }
        }
        // fix: if a whole pass resolved nothing, the remaining patterns reference
        // preVt tokens that will never appear; fail fast instead of looping forever.
        if (queue2.Count > 0 && queue2.Count == pendingBefore) {
            throw new InvalidOperationException("some patterns reference an undefined preVt token.");
        }
        {
            var tmp = queue; queue = queue2; queue2 = tmp;
        }
    }
    return wholeeNFA;
}
从ε-NFA到补充完整的ε-NFA
这一步的作用是:将ε-NFA中所有隐藏的ε边都画出来,以便后续处理。
基本思路是:如果A-ε->B且B-ε->C,那么应当画出A-ε->C这条ε边。
从ε-NFA到补充完整的ε-NFA
// make explicit every edge that is implied by ε edges.
private static eNFAInfo ManifesteNFA(eNFAInfo eNFA) {
    var clone = eNFA.Copy();   // work on a copy: the input automaton stays untouched
    SpreadEnds(clone);         // propagate the accepting flag along ε edges
    CompleteEdges(clone);      // add the transitive edges implied by ε chains
    return clone;
}
// Propagate the isEnd flag across ε edges: any state connected to an end state
// by an ε edge (in either direction) is itself marked as an end state.
private static void SpreadEnds(eNFAInfo eNFA) {
    var initialEnds = new Queue<eNFAState>();
    {
        // BFS from the start state, collecting all states already marked as ends.
        var queue = new Queue<eNFAState>(); queue.Enqueue(eNFA.start);
        var visited = new HashSet<eNFAState>(); // HashSet: O(1) membership instead of List's O(n)
        var enqueuedEnds = new HashSet<eNFAState>();
        while (queue.Count > 0) {
            var subject = queue.Dequeue();
            if (visited.Add(subject)) {
                if (subject.isEnd) {
                    if (enqueuedEnds.Add(subject)) { initialEnds.Enqueue(subject); }
                }
                foreach (var edge in subject.toEdges) {
                    var to = edge.to;
                    if (!visited.Contains(to)) { queue.Enqueue(to); }
                }
            }
        }
    }
    // spread the ends
    {
        var queue = initialEnds;
        var visited = new HashSet<eNFAState>(); // O(1) membership instead of List's O(n)
        while (queue.Count > 0) {
            var subject = queue.Dequeue();
            if (visited.Add(subject)) {
                // a state that reaches an end state via an ε edge is an end state itself.
                foreach (var edge in subject.fromEdges) {
                    if (edge.conditionCode is null) {
                        var from = edge.from; from.isEnd = true;
                        if (!visited.Contains(from)) { queue.Enqueue(from); }
                    }
                }
                // an end state's ε successors are reached "for free", so they are ends too.
                foreach (var edge in subject.toEdges) {
                    if (edge.conditionCode is null) {
                        var to = edge.to; to.isEnd = true;
                        if (!visited.Contains(to)) { queue.Enqueue(to); }
                    }
                }
            }
        }
    }
}
// complete edges: if A -ε-> B and B -x-> C exist, add A -x-> C.
private static void CompleteEdges(eNFAInfo eNFA) {
    var initialEmptyQueue = new Queue<eNFAEdge>();
    {
        // BFS collecting every ε edge reachable from the start state.
        var queue = new Queue<eNFAState>(); queue.Enqueue(eNFA.start);
        var visited = new HashSet<eNFAState>(); // HashSet: O(1) membership instead of List's O(n)
        var enqueuedEdges = new HashSet<eNFAEdge>();
        while (queue.Count > 0) {
            var subject = queue.Dequeue();
            if (visited.Add(subject)) {
                foreach (var edge in subject.toEdges) {
                    if (edge.conditionCode is null) {
                        if (enqueuedEdges.Add(edge)) { initialEmptyQueue.Enqueue(edge); }
                    }
                    var to = edge.to;
                    if (!visited.Contains(to)) { queue.Enqueue(to); }
                }
            }
        }
    }
    {
        var emptyQueue = initialEmptyQueue;
        var visited = new HashSet<eNFAEdge>(); // O(1) membership instead of List's O(n)
        while (emptyQueue.Count > 0) {
            var emptyEdge = emptyQueue.Dequeue();
            if (visited.Add(emptyEdge)) {
                var from = emptyEdge.from; var to = emptyEdge.to;
                foreach (var edge in to.toEdges) {
                    var to2 = edge.to;
                    // if(from -->|emptyEdge| to) { from -->|to.toEdges| to2 }
                    var newEdge = eNFAEdge.Connect(from, to2, edge.conditionCode);
                    if (newEdge is not null) {
                        // a newly created ε edge may itself imply further edges.
                        if (newEdge.conditionCode is null) { emptyQueue.Enqueue(newEdge); }
                    }
                }
            }
        }
    }
}
从补充完整的ε-NFA到NFA
这一步的作用是:去掉ε-NFA中的ε边(即无须任何char即可跳转过去的边),顺便去掉无用的状态State,以便后续处理。
基本思路是:复制原本的ε-NFA,但是,在复制过程中,如果边是ε边,则不复制它,也不复制它指向的State。
从补充完整的ε-NFA到NFA
// remove empty edges (and thus useless states): copy the manifested ε-NFA while
// skipping ε edges; states reachable only through ε edges are never copied.
private static NFAInfo ToNFA(eNFAInfo eNFAManifested) {
    // Template.state -> Copyed.state
    var stateDict = new Dictionary<eNFAState, NFAState>();
    // Template.edge -> Copyed.edge
    var edgeDict = new Dictionary<eNFAEdge, NFAEdge>();
    NFAInfo NFA;
    {
        var tStart = eNFAManifested.start;
        var cStart = new NFAState(tStart);
        stateDict.Add(tStart, cStart);
        NFA = new NFAInfo(cStart);
    }
    {
        var tStart = eNFAManifested.start;
        var queue = new Queue<eNFAState>(); queue.Enqueue(tStart);
        var visited = new HashSet<eNFAState>(); // HashSet: O(1) membership instead of List's O(n)
        while (queue.Count > 0) {
            var tSubject = queue.Dequeue();
            if (visited.Add(tSubject)) {
                // copy state
                if (!stateDict.TryGetValue(tSubject, out var cSubject)) {
                    cSubject = new NFAState(tSubject); stateDict.Add(tSubject, cSubject);
                }
                foreach (var tEdge in tSubject.toEdges) {
                    // ε edges are dropped; only char-consuming edges survive.
                    // (fix: removed the dead `allEmpty` flag that was computed but never read)
                    if (tEdge.conditionCode != null) {
                        var tTo = tEdge.to;
                        // copy state
                        if (!stateDict.TryGetValue(tTo, out var cTo)) {
                            cTo = new NFAState(tTo);
                            stateDict.Add(tTo, cTo);
                        }
                        // copy edge
                        if (!edgeDict.TryGetValue(tEdge, out var cEdge)) {
                            cEdge = NFAEdge.Connect(cSubject, cTo, tEdge.conditionCode, tEdge.possibleVts);
                            edgeDict.Add(tEdge, cEdge);
                        }
                        if (!visited.Contains(tTo)) { queue.Enqueue(tTo); }
                    }
                }
            }
        }
    }
    return NFA;
}
从NFA到DFA
NFA的一个缺点是:可能存在这样的情况,即A-x->B和A-x->C同时存在,即一个状态A可能在经过字符x时跳转到两个不同的状态B和C上。为了消灭这种情况,我们需要将NFA转换为等价的DFA。DFA就没有这种情况了。
基本思路是:若A-x->B和A-x->C同时存在,则将B和C视为一个整体,让这个整体作为DFA的一个状态。这样,A经过字符x时,就会只跳转到一个状态上了。
这被称为子集构造法(Subset Construction Algorithm)。
从NFA到DFA的子集构造法(Subset Construction Algorithm)
// Subset Construction Algorithm (子集构造法)
// transform from NFA to DFA.
private static DFAInfo ToDFA(NFAInfo NFA) {
    int DFAId = 0; // id in order of DFA state creation
    DFAInfo DFA;
    var stateList = new CoupleList<DFAStateDraft>();
    var edgeList = new CoupleList<DFAEdgeDraft>();
    var queue = new Queue<DFAStateDraft>();
    {
        var DFAStart = new DFAStateDraft(DFAId++, NFA.start.name, NFA.start);
        DFA = new DFAInfo(DFAStart, NFA);
        stateList.TryInsert(DFAStart);
        queue.Enqueue(DFAStart);
    }
    while (queue.Count > 0) { // find DFA states except the DFAStart
        var from = queue.Dequeue();
        // collect every char that can leave `from` (insertion order preserved).
        var chars = new List<char>();
        var seenChars = new HashSet<char>(); // O(1) duplicate check instead of List.Contains
        foreach (var NFAState in from.NFAStates) {
            foreach (var NFAEdge in NFAState.toEdges) {
                foreach (var c in NFAEdge.GetChars()) {
                    if (seenChars.Add(c)) { chars.Add(c); }
                }
            }
        }
        // group chars by the set of NFA edges they activate; each group becomes one DFA edge.
        var rawDict = new ListedDict<CoupleList<NFAEdgeDraft>, char>();
        foreach (var c in chars) {
            var NFAEdges = new CoupleList<NFAEdgeDraft>(); // -->|c| { toStates }
            foreach (var NFAState in from.NFAStates) {
                foreach (var NFAEdge in NFAState.toEdges) {
                    if (NFAEdge.Contains(c)) {
                        NFAEdges.TryInsert(NFAEdge);
                    }
                }
            }
            rawDict.TryInsert(NFAEdges, c);
        }
        foreach (var item in rawDict) {
            var NFAEdges = item.Key;
            var to = new DFAStateDraft(DFAId, from NFAEdge in NFAEdges select NFAEdge.to);
            if (stateList.TryInsert(to)) {
                // a brand-new DFA state: keep its id and schedule it for expansion.
                // (fix: a stray '}' after ToCharRange put `condition` out of scope)
                DFAId++;
                var literalChars = item.Value;
                string condition = ToCharRange(literalChars);
                var edge = DFAEdgeDraft.Connect(from, to, condition);
                edgeList.TryInsert(edge);
                queue.Enqueue(to);
            }
            else {
                // an equivalent DFA state already exists: connect to the old one instead.
                var t = stateList.IndexOf(to);
                var oldTo = stateList[t];
                var literalChars = item.Value;
                string condition = ToCharRange(literalChars);
                var edge = DFAEdgeDraft.Connect(from, oldTo, condition);
                edgeList.TryInsert(edge);
            }
        }
    }
    return DFA;
}
// Collapse a list of chars into a regex-like "[...]" char-class condition,
// e.g. { '0'..'9', 'a' } -> "[0-9a]". A single char is returned bare.
private static string/*DFAEdgeDraft.condition*/ ToCharRange(CoupleList<char> charList) {
if (charList.Count == 1) {
// single char: no brackets needed.
var c = charList[0];
return c.ToString();
}
var rangeItems = new List<RangeItem>();
var index = 0;
while (index < charList.Count) {
// Grow a run of adjacent chars. Math.Abs makes both ascending and
// descending step-by-1 runs merge into one [min-max] range.
var min = charList[index]; var max = charList[index];
while (index < charList.Count - 1
&& Math.Abs(charList[index] - charList[index + 1]) == 1) {
if (charList[index + 1] < min) { min = charList[index + 1]; }
if (max < charList[index + 1]) { max = charList[index + 1]; }
index++;
}
rangeItems.Add(new RangeItem(min, max));
index++;
}
// '-' must not sit inside a char class where it would read as a range
// operator; pull it out and emit it first, right after '['.
bool dashExists = false;
for (int t = 0; t < rangeItems.Count; t++) {
var rangeItem = rangeItems[t];
if (rangeItem.min == '-' && rangeItem.max == '-') {
rangeItems.Remove(rangeItem);
t--; // compensate for the removal while iterating by index
dashExists = true;
}
}
var b = new StringBuilder();
b.Append("["); if (dashExists) { b.Append("-"); }
foreach (var item in rangeItems) {
b.Append(item.ToCondition());
}
b.Append("]");
return b.ToString();
}
这一版本的效率太低,我已重写之。但这一版本更容易理解,因而放在这里。
从DFA到miniDFA
DFA的状态可能有很多,有时候是可以合并的。将其合并到数量最少的DFA,就是miniDFA。
基本思路是:
- 初始化:将代表一个Token的正则表达式的End状态分别独立划分出来,分别作为miniDFA的一个状态,将其他状态作为miniDFA的一个状态。
- 循环:拆分当前的各个miniDFA状态,方法是,如果这个miniDFA状态中的两个DFA状态不等价,则将其拆分到两个新的miniDFA状态中。何为不等价?miniDFA状态中的两个DFA状态A和B,若A和B对某个字符x,其跳转到不同的miniDFA状态,那么A和B就是不等价的。
- 收尾:无可拆分时,目前的状态就是全部miniDFA状态。最坏情况下,全部miniDFA状态与全部DFA状态的数量相同。
从DFA到miniDFA
// minimize states of the specified FAInfo (partition-refinement minimization).
private miniDFAInfo TominiDFA(DFAInfo DFAInfo) {
// every CoupleList<DFAState> is a/some potential miniDFA(s),
// and it needs to be further split.
List<CoupleList<DFAState>> chaos = InitChaos(DFAInfo);
// NOTE(review): `validChars` is not declared in this snippet — presumably an
// instance field holding the lexer's alphabet; confirm against the full class.
List<CoupleList<DFAState>> completedChaos = SplitChaos(chaos, validChars);
// dump minimum DFA
miniDFAState[] miniDFAStates = ConstructminiDFAStates(completedChaos);
miniDFAInfo miniDFA;
{
// the miniDFA start state is the group that contains the original DFA start.
miniDFAState? miniDFAStart = null;
foreach (var state in miniDFAStates) {
if (state.Contains(DFAInfo.start)) {
miniDFAStart = state; break;
}
}
miniDFA = new miniDFAInfo(miniDFAStart, DFAInfo);
}
// DFA state -> index of the item(which is a collection) in chaos
var DFA2Chaos = new Dictionary<DFAState, /*Collection<DFAState>*/int>();
{
for (int index = 0; index < completedChaos.Count; index++) {
var DFAStates = completedChaos[index];
foreach (var DFAState in DFAStates) { DFA2Chaos.Add(DFAState, /*DFAStates*/index); }
}
}
{
// edges of minimum DFA
// NOTE(review): `miniEdges` is filled but never read in this method —
// presumably miniDFAEdge.Connect also registers the edge on its states; verify.
var miniEdges = new CoupleList<miniDFAEdge>();
var queue = new Queue<DFAState>(); queue.Enqueue(DFAInfo.start);
var visited = new List<DFAState>();
while (queue.Count > 0) {
var subject = queue.Dequeue();
if (!visited.Contains(subject)) {
visited.Add(subject);
var fromIndex = DFA2Chaos[subject];
var miniDFAFrom = miniDFAStates[fromIndex];
foreach (var edge in subject.toEdges) {
var to = edge.to;
var toIndex = DFA2Chaos[to];
// map each DFA edge to an edge between the corresponding miniDFA groups.
var newEdge = miniDFAEdge.Connect(miniDFAFrom, miniDFAStates[toIndex], edge.conditionCode);
miniEdges.TryInsert(newEdge);
if (!visited.Contains(to)) { queue.Enqueue(to); }
}
}
}
}
return miniDFA;
}
// wrap each finished group of equivalent DFA states into one miniDFA state.
private static miniDFAState[] ConstructminiDFAStates(List<CoupleList<DFAState>> completedChaos) {
    var result = new miniDFAState[completedChaos.Count];
    for (int id = 0; id < result.Length; id++) {
        result[id] = new miniDFAState(id, completedChaos[id]);
    }
    return result;
}
// Repeatedly split the groups in `initialChaos` until every group contains only
// pairwise-equivalent DFA states. Terminates because each pass either increases
// the group count or leaves it unchanged (which stops the loop).
private List<CoupleList<DFAState>> SplitChaos(List<CoupleList<DFAState>> initialChaos, ICharRange validChars) {
    var currentChaos = initialChaos;
    bool updated = true;
    while (updated) {
        var nextChaos = new List<CoupleList<DFAState>>();
        foreach (var miniDFAEgg in currentChaos) {
            var merged = new bool[miniDFAEgg.Count];
            for (int i = 0; i < miniDFAEgg.Count; i++) {
                if (merged[i]) { continue; }
                // `standard` seeds a new group; every remaining unmerged state
                // that is equivalent to it joins that group.
                var standard = miniDFAEgg[i];
                var newEgg = new CoupleList<DFAState>(); newEgg.TryInsert(standard);
                merged[i] = true;
                for (int j = i + 1; j < miniDFAEgg.Count; j++) {
                    if (merged[j]) { continue; }
                    var state = miniDFAEgg[j];
                    // fix: EqualValue declares (standard, current, chaos) — the extra
                    // `validChars` argument did not match its signature.
                    if (EqualValue(standard, state, currentChaos)) {
                        newEgg.TryInsert(state);
                        merged[j] = true;
                    }
                }
                nextChaos.Add(newEgg);
            }
        }
        // an unchanged group count means no split happened: refinement is complete.
        updated = (nextChaos.Count != currentChaos.Count);
        currentChaos = nextChaos;
    }
    return currentChaos;
}
// For one DFA state, build: index of target group in `chaos` -> the merged char
// range on which `key` jumps into that group.
private Dictionary<int/*which miniDFAEgg*/, ICharRange> GetHopcroft(DFAState key, List<CoupleList<DFAState>> chaos) {
    var HopcroftBuilder = new Dictionary<int/*which miniDFAEgg*/, RangeListBuilder>();
    foreach (var edge in key.toEdges) {
        // locate which group of `chaos` this edge's target state belongs to.
        var found = false;
        for (int sIndex = 0; sIndex < chaos.Count; sIndex++) {
            var miniDFAEgg = chaos[sIndex];
            foreach (var DFAState in miniDFAEgg) {
                if (DFAState == edge.to) {
                    if (!HopcroftBuilder.TryGetValue(sIndex, out var builder)) {
                        builder = new RangeListBuilder();
                        HopcroftBuilder.Add(sIndex, builder);
                    }
                    builder.Append(edge.conditionCode);
                    found = true; break;
                }
            }
            if (found) { break; }
        }
    }
    // fix: `Hopcroft` was assigned without being declared, and a stray '}' left
    // the braces unbalanced; declare it here and materialize the built ranges.
    var Hopcroft = new Dictionary<int/*which miniDFAEgg*/, ICharRange>();
    foreach (var item in HopcroftBuilder) {
        var indexOfChaos = item.Key; var builder = item.Value;
        var conditionCode = builder.Build();
        Hopcroft.Add(indexOfChaos, conditionCode);
    }
    return Hopcroft;
}
// standard and current live in the same miniDFAEgg of chaos.
// They are equivalent iff, for every target group, they jump into it on the same char range.
private bool EqualValue(DFAState standard, DFAState current, List<CoupleList<DFAState>> chaos) {
    var standardDict = this.GetHopcroft(standard, chaos);
    var currentDict = this.GetHopcroft(current, chaos);
    // a different number of target groups means they cannot be equivalent.
    if (standardDict.Count != currentDict.Count) { return false; }
    foreach (var (indexOfChaos, sConditionCode) in standardDict) {
        if (!currentDict.TryGetValue(indexOfChaos, out var cConditionCode)) { return false; }
        if (!Algo.SameRange(sConditionCode, cConditionCode)) { return false; }
    }
    return true;
}
// Initial partition: each accepting (end) state forms its own singleton group —
// each represents a distinct token — while all non-end states share one group.
private static List<CoupleList<DFAState>> InitChaos(DFAInfo DFAInfo) {
    var chaos = new List<CoupleList<DFAState>>();
    var nonEnds = new CoupleList<DFAState>(DFAState.Comparer);
    var queue = new Queue<DFAState>(); queue.Enqueue(DFAInfo.start);
    var visited = new HashSet<DFAState>(); // HashSet: O(1) membership instead of List's O(n)
    while (queue.Count > 0) {
        var subject = queue.Dequeue();
        if (visited.Add(subject)) {
            // just split every ends for now
            if (subject.isEnd) {
                var ends = new CoupleList<DFAState>(1); ends.TryInsert(subject);
                chaos.Add(ends);
            }
            else { nonEnds.TryInsert(subject); }
            foreach (var edge in subject.toEdges) {
                var to = edge.to;
                if (!visited.Contains(to)) { queue.Enqueue(to); }
            }
        }
    }
    // keep the non-end group first so the "ordinary" states sit at index 0.
    if (nonEnds.Count > 0) { chaos.Insert(0, nonEnds); }
    return chaos;
}
至此,从正则表达式到miniDFA的算法就完成了。词法分析器的构造并不复杂,但是需要十分细致耐心,相关资料又少又乱,耗费的开发时间反而比语法分析器部分多。
从DFA到词法分析器的C#代码
DFA与词法分析器的核心代码是一一对应的。即,一个DFA的状态就是词法分析器里的一个LexicalState lexicalStateA;字段,它包含一个匿名函数,此函数根据输入的字符x,跳转到下一个LexicalState lexicalStateB;字段。一个DFA的状态的每个跳出边都是一个相应的else if(){}分支,使其跳转到恰当的状态。词法分析器在不断地跳转过程中,收集信息,在合适的位置截断输入流string,并为其赋予相应的类型,使之成为一个Token对象。
Calc.st的第一个状态【lexicalState0.cs】
partial class CompilerCalc {
// Generated lexer state 0: the start state of the lexical DFA. Given the
// current char it begins a token (BeginToken/ExtendToken) and returns the
// next LexicalState; whitespace is skipped; anything else becomes an Error token.
private static readonly LexicalState lexicalState0 = new LexicalState(
static (/* current char */ c, /* LexicalContext */ context) => {
if (false) { /* for simpler code generation purpose. */ }
/* user-input condition code */
/* \) */
else if (/* possible Vt : ')' */
/* single char */
c == ')'/*'\u0029'(41)*/) {
BeginToken(context, EType.@RightParenthesis符);
ExtendToken(context);
return lexicalState3;
}
/* user-input condition code */
/* \( */
else if (/* possible Vt : '(' */
/* single char */
c == '('/*'\u0028'(40)*/) {
BeginToken(context, EType.@LeftParenthesis符);
ExtendToken(context);
return lexicalState4;
}
/* user-input condition code */
/* \* */
else if (/* possible Vt : '*' */
/* single char */
c == '*'/*'\u002A'(42)*/) {
BeginToken(context, EType.@Asterisk符);
ExtendToken(context);
return lexicalState5;
}
/* user-input condition code */
/* - */
else if (/* possible Vt : '-' */
/* single char */
c == '-'/*'\u002D'(45)*/) {
BeginToken(context, EType.@Dash符);
ExtendToken(context);
return lexicalState6;
}
/* user-input condition code */
/* \+ */
else if (/* possible Vt : '+' */
/* single char */
c == '+'/*'\u002B'(43)*/) {
BeginToken(context, EType.@Plus符);
ExtendToken(context);
return lexicalState7;
}
/* user-input condition code */
/* [0-9] */
else if (/* possible Vt : 'number' */
/* [xxx] scope */
'0'/*'\u0030'(48)*/ <= c && c <= '9'/*'\u0039'(57)*/) {
BeginToken(context, EType.@number);
ExtendToken(context);
return lexicalState8;
}
/* user-input condition code */
/* \/ */
else if (/* possible Vt : '/' 'blockComment' 'inlineComment' */
/* 'blockComment' 'inlineComment' : comment ignores 'validScopeChars'([\u0000-\uFFFF]) and 'validGlobalChars'([\u0000-\uFFFF]) */
// '/' is ambiguous until more chars arrive: it may start '/', '/*' or '//'.
c == '/'/*'\u002F'(47)*/) {
BeginToken(context, EType.@Slash符, EType.@blockComment, EType.@inlineComment);
ExtendToken(context);
return lexicalState9;
}
/* accept everything else. */
else {
// whitespace and NUL are skipped: stay in the start state without opening a token.
if (c == ' ' || c == '\r' || c == '\n' || c == '\t' || c == '\0') { return lexicalState0; }
// default handler: unexpected char.
context.analyzingToken = new Token(context.Cursor, context.Line, context.Column);
context.result.Add(context.analyzingToken);
context.checkpoint = context.Cursor + 1;
context.analyzingToken.value = context.Substring(context.analyzingToken.index, context.checkpoint - context.analyzingToken.index);
context.analyzingToken.type = EType.Error;
context.result.errorDict.Add(context.analyzingToken, new TokenErrorInfo(context.analyzingToken, $"Unexpected char {c}"));
return lexicalState0;
}
});
}
由于这里仅仅是从DFA对象到C#源代码文件的转化,并不涉及编译原理相关算法,就不详述了。
编译原理中的语义分析器
语义分析器的基本思路是:后序优先遍历语法树,逐步得到用户所需的结果。
后序优先遍历算法
按后序优先遍历的顺序遍历语法树Node,才是按源代码中的字符顺序遍历源代码。在遍历时,对不同的结点执行不同的函数,就可以逐步得到语义结果。当然,这个“不同的函数”只能手工编写,不能自动生成了。
后序优先遍历语法树Node
/// <summary>
/// Extract some data structure from syntax tree.
/// <para>Post-order traversing.</para>
/// </summary>
/// <param name="rootNode">root node of the syntax tree.</param>
/// <param name="tokens">the token list correspond to <paramref name="rootNode"/>.</param>
/// <param name="sourceCode">the raw source text; passed through to the extraction context.</param>
/// <returns>the result accumulated in the context (may be null if nothing was produced).</returns>
public T? Extract(Node rootNode, TokenList tokens, string sourceCode) {
var context = new TContext<T>(rootNode, tokens, sourceCode);
// post-order traverse rootNode with stack(without recursion).
// Invariant: nodeStack holds the path from the root down to the node being
// visited; indexStack holds, per node on that path, the child index in progress.
var nodeStack = new Stack<Node>(); var indexStack = new Stack<int>();
// init stack.
{
// push nextLeft and its next pending children.
var nextLeft = rootNode; var index = 0;
nodeStack.Push(nextLeft); indexStack.Push(index);
while (nextLeft.Children.Count > 0) {
nextLeft = nextLeft.Children[0];
nodeStack.Push(nextLeft); indexStack.Push(0);
}
}
while (nodeStack.Count > 0) {
// popping yields the deepest pending node; index+1 is its next unvisited child.
var current = nodeStack.Pop(); var index = indexStack.Pop() + 1;
if (index < current.Children.Count) {
// push this node back again.
nodeStack.Push(current); indexStack.Push(index);
// push nextLeft and its next pending children.
var nextLeft = current.Children[index];
nodeStack.Push(nextLeft); indexStack.Push(0);
while (nextLeft.Children.Count > 0) {
nextLeft = nextLeft.Children[0];
nodeStack.Push(nextLeft); indexStack.Push(0);
}
}
else {
// all children done: visit `current` itself (the post-order position).
if (extractorDict.TryGetValue(current.type, out Action<Node, TContext<T>>? action)) {
action(current, context);
}
}
}
{
var current = this.endOfTokenList; // extra '¥' token indicates end of source code.
// NOTE(review): endOfTokenList appears to be a synthetic end-of-input Node — confirm its declaration.
if (extractorDict.TryGetValue(current.type, out Action<Node, TContext<T>>? action)) {
action(current, context);
}
}
return context.result;
}
在C#中,我也将这些“不同的函数”做成了匿名函数的形式。因为确实不需要知道它们的名字。
Calc.st文法的语义分析器【获取计算结果】
partial class CompilerCalc {
    /// <summary>
    /// <see cref="Node.type"/> -> <see cref="Action{Node, TContext{FinalValue}}"/>
    /// </summary>
    private static readonly Dictionary<string/*Node.type*/, Action<Node, TContext<FinalValue>>> @finalValueExtractorDict = new();
    // Default handler for terminal nodes: push the matching token so the
    // handler of the parent production can consume it.
    private static readonly Action<Node, TContext<FinalValue>> VtHandler =
        (node, context) => {
            var token = context.tokens[node.tokenIndex];
            context.objStack.Push(token);
        };
    /// <summary>
    /// initialize dict for extractor.
    /// </summary>
    private static void InitializeFinalValueExtractorDict() {
        var extractorDict = @finalValueExtractorDict;
        // Punctuation tokens carry no semantic value, so they keep no handler:
        // extractorDict.Add(EType.@Plus符, VtHandler);
        // extractorDict.Add(EType.@Dash符, VtHandler);
        // extractorDict.Add(EType.@Asterisk符, VtHandler);
        // extractorDict.Add(EType.@Slash符, VtHandler);
        // extractorDict.Add(EType.@LeftParenthesis符, VtHandler);
        // extractorDict.Add(EType.@RightParenthesis符, VtHandler);
        extractorDict.Add(EType.@number, VtHandler);
        extractorDict.Add(EType.@终,
            static (node, context) => {
                // [-1]: FinalValue : Additive ;
                // end-of-input: the single value left on the stack is the final result.
                var @finalValue = (double)context.objStack.Pop();
                context.result = new FinalValue(@finalValue);
            }); // end of extractorDict.Add(EType.@终, (node, context) => { ... });
        extractorDict.Add(EType.@Additive,
            static (node, context) => {
                if (false) { /* for simpler code generation process */ }
                else if (node.regulation == CompilerCalc.regulations[0]) {
                    // [0]: Additive : Additive '+' Multiplicative ;
                    // post-order: the right operand sits on top of the stack.
                    var @multiplicative0 = (double)context.objStack.Pop();
                    var @additive2 = (double)context.objStack.Pop();
                    var value = additive2 + multiplicative0;
                    context.objStack.Push(value);
                }
                else if (node.regulation == CompilerCalc.regulations[1]) {
                    // [1]: Additive : Additive '-' Multiplicative ;
                    var @multiplicative0 = (double)context.objStack.Pop();
                    var @additive2 = (double)context.objStack.Pop();
                    var value = additive2 - multiplicative0;
                    context.objStack.Push(value);
                }
                else if (node.regulation == CompilerCalc.regulations[2]) {
                    // [2]: Additive : Multiplicative ; (pass-through: value stays on the stack)
                }
                else { throw new NotImplementedException(); }
            }); // end of extractorDict.Add(EType.@Additive, (node, context) => { ... });
        extractorDict.Add(EType.@Multiplicative,
            static (node, context) => {
                if (false) { /* for simpler code generation process */ }
                else if (node.regulation == CompilerCalc.regulations[3]) {
                    // [3]: Multiplicative : Multiplicative '*' Primary ;
                    var @primary0 = (double)context.objStack.Pop();
                    var @multiplicative2 = (double)context.objStack.Pop();
                    var value = multiplicative2 * primary0;
                    context.objStack.Push(value);
                }
                else if (node.regulation == CompilerCalc.regulations[4]) {
                    // [4]: Multiplicative : Multiplicative '/' Primary ;
                    var @primary0 = (double)context.objStack.Pop();
                    var @multiplicative2 = (double)context.objStack.Pop();
                    var value = multiplicative2 / primary0;
                    context.objStack.Push(value);
                }
                else if (node.regulation == CompilerCalc.regulations[5]) {
                    // [5]: Multiplicative : Primary ; (pass-through)
                }
                else { throw new NotImplementedException(); }
            }); // end of extractorDict.Add(EType.@Multiplicative, (node, context) => { ... });
        extractorDict.Add(EType.@Primary,
            static (node, context) => {
                if (false) { /* for simpler code generation process */ }
                else if (node.regulation == CompilerCalc.regulations[6]) {
                    // [6]: Primary : '(' Additive ')' ; (pass-through: parentheses only group)
                }
                else if (node.regulation == CompilerCalc.regulations[7]) {
                    // [7]: Primary : 'number' ;
                    // fix: direct cast instead of `as` — a bad stack entry now fails fast
                    // with InvalidCastException rather than a later NullReferenceException.
                    var @number0 = (Token)context.objStack.Pop();
                    // 'number' is [0-9]+, so parse culture-invariantly.
                    var value = double.Parse(number0.value, System.Globalization.CultureInfo.InvariantCulture);
                    context.objStack.Push(value);
                }
                else { throw new NotImplementedException(); }
            }); // end of extractorDict.Add(EType.@Primary, (node, context) => { ... });
    }
}
End
标签:解析器,文法,list,Add,actionDict,EType,算法,var,new From: https://www.cnblogs.com/bitzhuwei/p/18544785