扫一扫
分享文章到微信
扫一扫
关注官方公众号
至顶头条
/* Operation code describing what the parsing automaton is currently doing. */
typedef enum
{
    paChild   = 0,  /* set before descending into a child node */
    paSibling = 1,  /* set after a closing tag has been consumed */
    paAttr    = 2   /* set before parsing an attribute set */
}XMLParseAction;
/* State carried by the parsing automaton between parsing steps. */
typedef struct tagXMLMac
{
    /* binary tree that keeps the node relationships and stores node properties */
    LINKPTR tree;
    /* parent of the node currently being parsed; used for backtracking */
    LINKPTR parent;
    /* current operation code of the automaton */
    XMLParseAction act;
    /* status code of the automaton */
    int retcode;
    /* pointer to the current position in the text being parsed */
    TCHAR* token;
}XMLMac;
/* Names of the built-in node properties. */
#define NODENAME _T("NodeName")
#define NODETEXT _T("NodeText")
#define NODETYPE _T("NodeType")
/* Frequently used fixed symbols.
 * The character constants had been mangled into doubled single quotes,
 * which is not valid C; they are restored here. */
#define XMLNS _T("xmlns:")
#define NSS _T(':')
#define ASIGN _T('=')
#define QUATE _T('"')
/* Sentinel that ends every terminator table. It cannot be '\0', because
 * '\0' is itself a member of several of those tables. */
#define NILL _T('\x02')
/* Characters treated as blanks. Note that '?' is a member, so it is
 * skipped exactly like whitespace. Every table ends with NILL. */
static TCHAR BlankSign[] = {_T(' '),_T('?'),_T('\t'),_T('\r'),_T('\n'),NILL};
/* Characters that end node text. */
static TCHAR TextTerm[] = {_T('<'),_T('\0'),NILL};
/* Characters that end an attribute value. The original table carried a
 * disabled '>' entry, so '>' does not end a value. */
static TCHAR AttrTerm[] = {_T('"'),_T('\0'),NILL};
/* Characters that end the name part of an opening tag. */
static TCHAR TagHeadTerm[] = {_T(' '),_T('/'),_T('>'),_T('\t'),_T('\r'),_T('\n'),_T('\0'),NILL};
/* Characters that end a closing tag. */
static TCHAR TagTailTerm[] = {_T('>'),_T('\0'),NILL};
/* Kinds of markup recognised at a '<'. */
typedef enum
{
    ttENT = 0,  /* ordinary element tag */
    ttXML = 1,  /* "<?xml" declaration */
    ttCMT = 2,  /* comment */
    ttELE = 3,  /* "<!ELEMENT" declaration */
    ttCDA = 4,  /* CDATA section */
    ttDOC = 5,  /* "<!DOCTYPE" declaration */
    ttEXT = 6,  /* "<!ENTITY" declaration */
    ttNOT = 7   /* "<!NOTATION" declaration */
}TagType;

/* Leading text that identifies each kind of markup. */
#define TT_ENT _T("<")          /* ordinary element tag */
#define TT_XML _T("<?xml")      /* xml declaration */
#define TT_CMT _T("<!--")       /* comment */
#define TT_ELE _T("<!ELEMENT")  /* element type declaration */
#define TT_CDA _T("<![CDATA[")  /* CDATA section */
#define TT_DOC _T("<!DOCTYPE")  /* document type declaration */
#define TT_EXT _T("<!ENTITY")   /* entity declaration */
#define TT_NOT _T("<!NOTATION") /* notation declaration */
3、定义解析过程实现:
/* Returns 1 when ch is listed in BlankSign, otherwise 0.
 * The NILL sentinel ends the search and is never matched itself. */
int _IsBlankSign(TCHAR ch)
{
    TCHAR* entry;

    for(entry = BlankSign; *entry != NILL; entry ++)
    {
        if(*entry == ch)
            return 1;
    }
    return 0;
}
/* Returns 1 when ch is listed in TagHeadTerm (it ends the name part of an
 * opening tag), otherwise 0. The NILL sentinel is never matched itself. */
int _IsTagHeadTerm(TCHAR ch)
{
    TCHAR* entry;

    for(entry = TagHeadTerm; *entry != NILL; entry ++)
    {
        if(*entry == ch)
            return 1;
    }
    return 0;
}
/* Returns 1 when ch is listed in TagTailTerm (it ends a closing tag),
 * otherwise 0. The NILL sentinel is never matched itself. */
int _IsTagTailTerm(TCHAR ch)
{
    TCHAR* entry;

    for(entry = TagTailTerm; *entry != NILL; entry ++)
    {
        if(*entry == ch)
            return 1;
    }
    return 0;
}
/* Returns 1 when ch is listed in TextTerm (it ends node text),
 * otherwise 0. The NILL sentinel is never matched itself. */
int _IsTextTerm(TCHAR ch)
{
    TCHAR* entry;

    for(entry = TextTerm; *entry != NILL; entry ++)
    {
        if(*entry == ch)
            return 1;
    }
    return 0;
}
/* Returns 1 when ch is listed in AttrTerm (it ends an attribute value),
 * otherwise 0. The NILL sentinel is never matched itself. */
int _IsAttrTerm(TCHAR ch)
{
    TCHAR* entry;

    for(entry = AttrTerm; *entry != NILL; entry ++)
    {
        if(*entry == ch)
            return 1;
    }
    return 0;
}
/* Tests whether key starts with the namespace prefix XMLNS ("xmlns:").
 *
 * key: string to test; NULL is treated as "no match".
 * Returns 1 when key begins with XMLNS, otherwise 0.
 *
 * The previous version called _tcsncpy instead of _tcsncmp: it overwrote
 * the start of the caller's buffer with XMLNS and, because the copy
 * routine returns the (non-null) destination pointer, always returned 0. */
int _IsNameSpace(TCHAR* key)
{
    if(key == NULL)
        return 0;

    return (_tcsncmp(key,XMLNS,_tcslen(XMLNS)) == 0)? 1 : 0;
}
/* Classifies the markup that starts at sz.
 *
 * Returns the TagType whose leading text matches the start of sz, or -1
 * when nothing matches. The probes run in a fixed order with the bare "<"
 * last, because every other prefix also starts with '<'. */
int _TagType(TCHAR* sz)
{
    static const struct
    {
        const TCHAR* prefix;
        int type;
    } probes[] =
    {
        {TT_XML, ttXML},
        {TT_CMT, ttCMT},
        {TT_ELE, ttELE},
        {TT_CDA, ttCDA},
        {TT_DOC, ttDOC},
        {TT_EXT, ttEXT},
        {TT_NOT, ttNOT},
        {TT_ENT, ttENT}
    };
    unsigned int i;

    for(i = 0; i < sizeof(probes) / sizeof(probes[0]); i ++)
    {
        if(_tcsncmp(sz,probes[i].prefix,_tcslen(probes[i].prefix)) == 0)
            return probes[i].type;
    }
    return -1;
}
/* Skips the characters that _IsBlankSign accepts.
 *
 * szXML: position to start from; NULL is passed through as NULL.
 * Returns a pointer to the first non-blank character, or NULL when the end
 * of the string is reached first. Callers must therefore check for NULL
 * before dereferencing the result.
 *
 * The character constant had been mangled into doubled single quotes,
 * which is not valid C; it is restored here. */
TCHAR* _XMLSkipBlank(TCHAR* szXML)
{
    TCHAR* token = szXML;

    if(token == NULL)
        return NULL;

    while(_IsBlankSign(*token))
        token ++;

    return (*token == _T('\0'))? NULL : token;
}
/* Skips an xml declaration such as <?xml ... ?>.
 *
 * szXML: must point at the leading text TT_XML; the scan starts right
 *        after it.
 * Returns the position after the first '>', or the terminating '\0' when
 * no '>' is found.
 *
 * The character constants had been mangled into doubled single quotes,
 * which is not valid C; they are restored here. */
TCHAR* _XMLSkipXML(TCHAR* szXML)
{
    TCHAR* token = szXML + _tcslen(TT_XML);

    while(*token != _T('>') && *token != _T('\0'))
        token ++;

    if(*token == _T('>'))
        token ++; /* step over the closing '>' */

    return token;
}
/* Skips a comment such as <!-- ... -->.
 *
 * szXML: must point at the leading text TT_CMT; the scan starts right
 *        after it.
 * Returns the position after the closing "-->", or the terminating '\0'
 * when the comment is never closed.
 *
 * The previous version stopped at the first '>', so a comment containing
 * '>' (for example commented-out markup) ended early and the rest of the
 * comment was parsed as document content. */
TCHAR* _XMLSkipCMT(TCHAR* szXML)
{
    TCHAR* token = szXML + _tcslen(TT_CMT);

    while(*token != _T('\0'))
    {
        /* short-circuit evaluation keeps the look-ahead inside the string */
        if(token[0] == _T('-') && token[1] == _T('-') && token[2] == _T('>'))
            return token + 3;
        token ++;
    }
    return token;
}
/* Skips an element type declaration such as <!ELEMENT ...>.
 *
 * szXML: must point at the leading text TT_ELE; the scan starts right
 *        after it.
 * Returns the position after the first '>', or the terminating '\0' when
 * no '>' is found.
 *
 * The character constants had been mangled into doubled single quotes,
 * which is not valid C; they are restored here. */
TCHAR* _XMLSkipELE(TCHAR* szXML)
{
    TCHAR* token = szXML + _tcslen(TT_ELE);

    while(*token != _T('>') && *token != _T('\0'))
        token ++;

    if(*token == _T('>'))
        token ++; /* step over the closing '>' */

    return token;
}
/* Skips a CDATA section such as <![CDATA[ ... ]]>.
 *
 * szXML: must point at the leading text TT_CDA; the scan starts right
 *        after it.
 * Returns the position after the closing "]]>", or the terminating '\0'
 * when the section is never closed.
 *
 * The previous loop stopped as soon as the current OR the next character
 * was ']', then stepped over two characters only. It therefore ended on a
 * lone ']' inside the data, could return a pointer into the middle of the
 * section, and never consumed the final '>'. */
TCHAR* _XMLSkipCDA(TCHAR* szXML)
{
    TCHAR* token = szXML + _tcslen(TT_CDA);

    while(*token != _T('\0'))
    {
        /* short-circuit evaluation keeps the look-ahead inside the string */
        if(token[0] == _T(']') && token[1] == _T(']') && token[2] == _T('>'))
            return token + 3;
        token ++;
    }
    return token;
}
/* Skips a document type declaration such as <!DOCTYPE ...>.
 *
 * szXML: must point at the leading text TT_DOC; the scan starts right
 *        after it.
 * Returns the position after the '>' that closes the declaration, or the
 * terminating '\0' when it is never closed.
 *
 * A DOCTYPE may carry an internal subset in square brackets whose own
 * declarations end with '>'. The previous version stopped at the first of
 * those, leaving the rest of the subset to be parsed as content. A '>'
 * between '[' and ']' is now ignored. Quoted literals are not interpreted,
 * so a ']' inside a quoted value still ends the subset early. */
TCHAR* _XMLSkipDOC(TCHAR* szXML)
{
    TCHAR* token = szXML + _tcslen(TT_DOC);
    int insubset = 0;

    while(*token != _T('\0'))
    {
        if(*token == _T('['))
            insubset = 1;
        else if(*token == _T(']'))
            insubset = 0;
        else if(*token == _T('>') && !insubset)
            return token + 1; /* step over the closing '>' */
        token ++;
    }
    return token;
}
/* Skips an entity declaration such as <!ENTITY ...>.
 *
 * szXML: must point at the leading text TT_EXT; the scan starts right
 *        after it.
 * Returns the position after the first '>', or the terminating '\0' when
 * no '>' is found.
 *
 * The character constants had been mangled into doubled single quotes,
 * which is not valid C; they are restored here. */
TCHAR* _XMLSkipEXT(TCHAR* szXML)
{
    TCHAR* token = szXML + _tcslen(TT_EXT);

    while(*token != _T('>') && *token != _T('\0'))
        token ++;

    if(*token == _T('>'))
        token ++; /* step over the closing '>' */

    return token;
}
/* Skips a notation declaration such as <!NOTATION ...>.
 *
 * szXML: must point at the leading text TT_NOT; the scan starts right
 *        after it.
 * Returns the position after the first '>', or the terminating '\0' when
 * no '>' is found.
 *
 * The character constants had been mangled into doubled single quotes,
 * which is not valid C; they are restored here. */
TCHAR* _XMLSkipNOT(TCHAR* szXML)
{
    TCHAR* token = szXML + _tcslen(TT_NOT);

    while(*token != _T('>') && *token != _T('\0'))
        token ++;

    if(*token == _T('>'))
        token ++; /* step over the closing '>' */

    return token;
}
/* Skips an assignment sign together with the blanks around it,
 * e.g. the  =  in  name = "value".
 *
 * szXML: position at or before the assignment sign.
 * Returns the first non-blank position after the sign, or NULL when the
 * next non-blank character is not ASIGN or the input ends.
 *
 * _XMLSkipBlank returns NULL at the end of the string; the previous
 * version dereferenced that result without checking it. */
TCHAR* _XMLSkipAsign(TCHAR* szXML)
{
    TCHAR* token = _XMLSkipBlank(szXML);

    if(token == NULL || *token != ASIGN)
        return NULL;

    return _XMLSkipBlank(token + 1);
}
/* Skips the name part of an opening tag, e.g. the  <sometag  of
 * <sometag attr="1">.
 *
 * szXML: must point at the '<' of the tag (asserted).
 * Returns the position of the character that ends the name (one of the
 * TagHeadTerm characters), or NULL when the tag has no name ("<>") or the
 * input ends before the name is terminated.
 *
 * The character constants had been mangled into doubled single quotes,
 * which is not valid C; they are restored here. */
TCHAR* _XMLSkipTagHeader(TCHAR* szXML)
{
    TCHAR* token;

    assert(*szXML == _T('<'));

    token = _XMLSkipBlank(szXML + 1);
    if(token == NULL || *token == _T('>'))
        return NULL;

    /* '\0' is a TagHeadTerm member, so this scan cannot leave the string */
    while(!_IsTagHeadTerm(*token))
        token ++;

    if(*token == _T('\0'))
        return NULL;

    return token;
}
/* Consumes closing tags and backtracks through the enclosing nodes,
 * e.g. </sometag1></sometag2>...
 *
 * Starting at pm->token, every run of "/...>" (the end of an empty
 * element) or "</...>" moves pm->parent one level up via
 * GetTreeDataParentItem, sets pm->act to paSibling and advances pm->token
 * past the '>'.
 *
 * On return:
 *  - input exhausted: pm->retcode = XP_SUCCESS, pm->token is left at the
 *    last position that was stored;
 *  - a closing tag is not terminated by '>': pm->retcode = XP_ERROR;
 *  - a closing tag appears while no node is open: pm->retcode = XP_ERROR;
 *  - otherwise pm->token points at the next non-blank character and
 *    pm->retcode is left untouched.
 *
 * Changes from the previous version: the mangled character constants are
 * restored, the self-recursion is an equivalent loop, and a NULL
 * pm->parent is rejected instead of being handed to
 * GetTreeDataParentItem. */
void _XMLSkipTagTail(XMLMac* pm)
{
    TCHAR* token;

    for(;;)
    {
        token = _XMLSkipBlank(pm->token);
        if(token == NULL)
        {
            pm->retcode = XP_SUCCESS;
            return;
        }

        if(!(*token == _T('/') || (*token == _T('<') && *(token + 1) == _T('/'))))
        {
            /* not a closing tag: hand the position back to the caller */
            pm->token = token;
            return;
        }

        /* '\0' is a TagTailTerm member, so this scan cannot leave the string */
        while(!_IsTagTailTerm(*token))
            token ++;

        if(*token == _T('\0'))
        {
            pm->retcode = XP_ERROR;
            return;
        }
        token ++; /* step over the closing '>' */

        if(pm->parent == NULL)
        {
            /* closing tag without a matching open node */
            pm->retcode = XP_ERROR;
            return;
        }

        /* backtrack to the parent node */
        pm->parent = GetTreeDataParentItem(pm->parent);
        pm->token = token;
        pm->act = paSibling;
    }
}
/* Splits a qualified tag name such as  xsl:entname  into its prefix and
 * local parts without copying anything.
 *
 * sz:     start of the name; the name ends at the first TagHeadTerm
 *         character.
 * ns:     receives the start of the prefix, or NULL when there is none.
 * nslen:  receives the prefix length, or 0 when there is none.
 * ent:    receives the start of the local name.
 * entlen: receives the length of the local name. */
void _SplitNameSpace(TCHAR* sz,TCHAR** ns,int* nslen,TCHAR** ent,int* entlen)
{
    TCHAR* cur = sz;
    TCHAR* local;

    /* look for the separator inside the name */
    while(*cur != NSS && !_IsTagHeadTerm(*cur))
        cur ++;

    if(*cur != NSS)
    {
        /* no separator: the whole name is the local name */
        *ns = NULL;
        *nslen = 0;
        *ent = sz;
        *entlen = cur - sz;
        return;
    }

    *ns = sz;
    *nslen = cur - sz;

    /* the local name runs from just after the separator to the terminator */
    local = cur + 1;
    cur = local;
    while(!_IsTagHeadTerm(*cur))
        cur ++;

    *ent = local;
    *entlen = cur - local;
}
/* Parses the attribute set of the node pm->parent.
 *
 * Starting at pm->token, each  key = "value"  pair is written to
 * pm->parent with WriteTreeDataItemProper. Parsing stops at the '>' or
 * '/' that ends the opening tag.
 *
 * On return:
 *  - pm->retcode = XP_SUCCESS and pm->token points at that '>' or '/';
 *  - pm->retcode = XP_ERROR when the assignment sign is missing or the
 *    input ends inside the attribute set (pm->token is then not advanced
 *    past the last complete attribute).
 *
 * Changes from the previous version:
 *  - the key scan now stops at '\0'; neither _IsBlankSign nor ASIGN
 *    matches the terminator, so the old loop ran past the end of the
 *    string when the input ended inside a key;
 *  - the end of input is detected before calling _XMLSkipAsign;
 *  - the per-attribute self-recursion is an equivalent loop;
 *  - the mangled character constants are restored. */
void _XMLParseAttr(XMLMac* pm)
{
    TCHAR* token = pm->token;
    TCHAR* key;
    TCHAR* val;
    int keylen,vallen;

    for(;;)
    {
        key = token;
        while(*token != _T('\0') && !_IsBlankSign(*token) && *token != ASIGN)
            token ++;
        keylen = token - key;

        /* input ended after the key */
        if(_XMLSkipBlank(token) == NULL)
        {
            pm->retcode = XP_ERROR;
            return;
        }

        token = _XMLSkipAsign(token);
        if(token == NULL)
        {
            pm->retcode = XP_ERROR;
            return;
        }

        if(*token == QUATE) /* step over the opening quote */
            token ++;

        val = token;
        /* '\0' is an AttrTerm member, so this scan cannot leave the string */
        while(!_IsAttrTerm(*token))
            token ++;
        vallen = token - val;

        WriteTreeDataItemProper(pm->parent,key,keylen,val,vallen);

        if(*token == QUATE) /* step over the closing quote */
            token ++;

        token = _XMLSkipBlank(token);
        if(token == NULL)
        {
            pm->retcode = XP_ERROR;
            return;
        }

        pm->token = token;

        /* the attribute set ends where the opening tag ends */
        if(*token == _T('>') || *token == _T('/'))
        {
            pm->retcode = XP_SUCCESS;
            return;
        }

        pm->retcode = XP_CONTINUE;
    }
}
/* Parses the text of a node, e.g. the  sometext  in
 * <sometag>sometext</sometag>.
 *
 * Leading blanks after pm->token are skipped; the text then runs up to
 * the next '<' and is written to pm->parent as the NODETEXT property
 * (also when it is empty).
 *
 * On return:
 *  - pm->retcode = XP_SUCCESS and pm->token points at the '<' that ended
 *    the text;
 *  - pm->retcode = XP_ERROR when the input ends before a '<' is found;
 *    nothing is written in that case.
 *
 * The character constant had been mangled into doubled single quotes,
 * which is not valid C; it is restored here. */
void _XMLParseTagText(XMLMac* pm)
{
    TCHAR* token;
    TCHAR* val;
    int vallen;

    token = _XMLSkipBlank(pm->token);
    if(token == NULL)
    {
        pm->retcode = XP_ERROR;
        return;
    }

    val = token;
    /* '\0' is a TextTerm member, so this scan cannot leave the string */
    while(!_IsTextTerm(*token))
        token ++;

    if(*token == _T('\0'))
    {
        pm->retcode = XP_ERROR;
        return;
    }

    vallen = token - val;
    WriteTreeDataItemProper(pm->parent,NODETEXT,-1,val,vallen);

    pm->token = token;
    pm->retcode = XP_SUCCESS;
}
/* Parses the markup starting at pm->token: the current node, its
 * children and its following siblings.
 *
 * Each pass of the loop handles one construct:
 *  - xml declarations, comments, CDATA sections and DTD declarations are
 *    skipped without being stored;
 *  - an element gets a new tree item appended under pm->parent with its
 *    NODENAME (and XMLNS when the name has a prefix), then its attributes
 *    and text are parsed, and the new item becomes pm->parent until its
 *    closing tag is consumed by _XMLSkipTagTail.
 *
 * On return pm->retcode is XP_SUCCESS when the input ended with no node
 * left open, and XP_ERROR on malformed input or when the input ended while
 * a node was still open.
 *
 * Changes from the previous version:
 *  - the mangled character constants (doubled single quotes) are restored;
 *  - a closing tag found at the start of a pass is now consumed. This
 *    happens after a skipped construct, e.g. <a><!-- c --></a>; it used to
 *    be parsed as a new element with an empty name, which left the real
 *    node open;
 *  - every recursive call was in tail position, so the recursion is an
 *    equivalent loop and stack use no longer grows with the node count;
 *  - the seven identical skip branches are one switch, and the unused
 *    local len is removed.
 *
 * NOTE(review): text that follows a child element inside the same parent
 * (mixed content) is still rejected by the _TagType check - confirm that
 * this is intended. */
void _XMLParseTagEntity(XMLMac* pm)
{
    TCHAR* token;
    TCHAR* tag;
    TCHAR* ns;
    TCHAR* ent;
    int nslen,entlen;
    int tt;
    LINKPTR item;

    for(;;)
    {
        token = _XMLSkipBlank(pm->token);
        if(token == NULL)
        {
            if(pm->parent == NULL)
                pm->retcode = XP_SUCCESS; /* nothing left to parse */
            else
                pm->retcode = XP_ERROR; /* a closing tag is missing */
            return;
        }

        /* closing tag of the current node, reached after skipped markup */
        if(token[0] == _T('<') && token[1] == _T('/'))
        {
            if(pm->parent == NULL)
            {
                pm->retcode = XP_ERROR; /* closing tag without an open node */
                return;
            }
            pm->token = token;
            pm->retcode = XP_CONTINUE; /* do not let a stale code fail the check below */
            _XMLSkipTagTail(pm);
            if(pm->retcode == XP_ERROR)
                return;
            continue;
        }

        tt = _TagType(token);
        if(tt < 0)
        {
            pm->retcode = XP_ERROR; /* not the start of any known markup */
            return;
        }

        /* these constructs are skipped, not stored */
        if(tt != ttENT)
        {
            switch(tt)
            {
            case ttXML:
                token = _XMLSkipXML(token);
                break;
            case ttCMT:
                token = _XMLSkipCMT(token);
                break;
            case ttELE:
                token = _XMLSkipELE(token);
                break;
            case ttCDA:
                token = _XMLSkipCDA(token);
                break;
            case ttDOC:
                token = _XMLSkipDOC(token);
                break;
            case ttEXT:
                token = _XMLSkipEXT(token);
                break;
            case ttNOT:
                token = _XMLSkipNOT(token);
                break;
            default:
                pm->retcode = XP_ERROR; /* _TagType returned an unknown kind */
                return;
            }
            pm->token = token;
            continue;
        }

        /* start of an element */
        tag = token + 1; /* step over '<' */
        token = _XMLSkipTagHeader(token);
        if(token == NULL)
        {
            pm->retcode = XP_ERROR; /* invalid tag name */
            return;
        }

        item = InsertTreeDataItem(pm->tree,pm->parent,LINK_LAST);

        /* store the local name and, when present, the prefix */
        _SplitNameSpace(tag,&ns,&nslen,&ent,&entlen);
        WriteTreeDataItemProper(item,NODENAME,-1,ent,entlen);
        if(nslen != 0)
            WriteTreeDataItemProper(item,XMLNS,-1,ns,nslen);

        /* the new node is the parent of whatever follows */
        pm->parent = item;

        token = _XMLSkipBlank(token);
        if(token == NULL)
        {
            pm->retcode = XP_ERROR;
            return;
        }

        if(*token != _T('>'))
        {
            if(*token != _T('/')) /* the opening tag carries attributes */
            {
                pm->act = paAttr;
                pm->token = token;
                _XMLParseAttr(pm);
                if(pm->retcode == XP_ERROR)
                    return;
                token = pm->token; /* now at '>' or '/' */
            }

            if(*token == _T('/')) /* empty element: close it and move on */
            {
                pm->token = token;
                _XMLSkipTagTail(pm);
                if(pm->retcode == XP_ERROR)
                    return;
                continue; /* parse the next sibling */
            }
        }
        token ++; /* step over '>' */

        /* node text */
        pm->token = token;
        pm->retcode = XP_CONTINUE;
        _XMLParseTagText(pm);
        if(pm->retcode == XP_ERROR)
            return;

        token = pm->token;
        assert(*token == _T('<'));

        if(*(token + 1) != _T('/')) /* the node has children */
        {
            pm->act = paChild;
            pm->retcode = XP_CONTINUE;
            continue; /* parse the first child */
        }

        /* no children: close the node and backtrack */
        _XMLSkipTagTail(pm);
        if(pm->retcode == XP_ERROR)
            return;
        /* the next pass parses the following sibling */
    }
}
如果您非常迫切的想了解IT领域最新产品与技术信息,那么订阅至顶网技术邮件将是您的最佳途径之一。
现场直击|2021世界人工智能大会
直击5G创新地带,就在2021MWC上海
5G已至 转型当时——服务提供商如何把握转型的绝佳时机
寻找自己的Flag
华为开发者大会2020(Cloud)- 科技行者