数据格式:
<PAIR> <TEXT>AAA</TEXT> <LINK>BBB</LINK> </PAIR> ...... <PAIR> <TEXT>XXX</TEXT> <LINK>YYY</LINK> </PAIR>
代码:特别注意 re.DOTALL(它使 "." 也能匹配换行符,因此标签内容可以跨多行)
def getAnchors(self):
    """Extract anchor texts and links from the "ANCHOR" field.

    The field is read with ``self._get("ANCHOR")`` and is scanned for
    blocks of the form::

        <PAIR> <TEXT>...</TEXT> <LINK>...</LINK> </PAIR>

    Returns:
        A ``(texts, links)`` tuple of two parallel lists of stripped
        strings, one entry per ``<PAIR>`` block, in order of appearance.
        A text that is empty (or whose ``<TEXT>`` tag is missing) is
        replaced by ``"EMPTY_ANCHOR_TEXT"``; a missing ``<LINK>`` tag
        yields an empty link string.
    """
    astr = self._get("ANCHOR")

    # re.DOTALL lets "." match newlines, so a tag's content may span
    # several lines.  The non-greedy ".*?" stops at the first closing tag.
    pairpat = re.compile(r"<PAIR>.*?</PAIR>", re.DOTALL)
    textpat = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)
    linkpat = re.compile(r"<LINK>(.*?)</LINK>", re.DOTALL)

    texts = []
    links = []
    for pair in pairpat.finditer(astr):
        anchor = pair.group()

        # search() returns None when the tag is absent; treat that like an
        # empty tag instead of crashing, so both lists stay the same length.
        text_match = textpat.search(anchor)
        link_match = linkpat.search(anchor)
        text = text_match.group(1).strip() if text_match else ""
        link = link_match.group(1).strip() if link_match else ""

        texts.append(text or "EMPTY_ANCHOR_TEXT")
        links.append(link)

    return texts, links