#!/usr/bin/python
# Take a MediaWiki article in XML format, as produced by wiki2xml, and extract
# plain text. Uses the BeautifulSoup parser, since wiki2xml's output is not
# well-formed XML.
#
# Evan Jones <evanj@mit.edu>
# April, 2008
# Released under a BSD licence.
# http://evanjones.ca/software/wikipedia2text.html

import htmlentitydefs
import re

import BeautifulSoup

# By default, BeautifulStoneSoup doesn't allow any tag to nest
class WikiSoup(BeautifulSoup.BeautifulStoneSoup):
    """Parser for wiki2xml output: allows nesting for most tags, except
    <paragraph> and <heading>."""

    NESTABLE_TAGS = {
        # Forces a <heading> tag to pop back up to <article>.
        "heading": ["article"],

        "link": [],

        # Maybe only allow these under <link>?
        "target": [],
        "part": [],
        "trail": [],
    
        "extension": [],
        "template": [],
        "arg": [],

        "list": [],
        "listitem": [],

        "table": [],
        "tablerow": [],
        "tablecell": [],

        "bold": [],
        "italics": [],

        "sup": [],
        "sub": [],
        "preblock": [],
        "preline": [],
    }

    # A <space /> tag represents a single space character and never has content
    SELF_CLOSING_TAGS = { "space": None }

    def __init__(self, data):
        BeautifulSoup.BeautifulStoneSoup.__init__(self, data,
                convertEntities=BeautifulSoup.BeautifulStoneSoup.XHTML_ENTITIES)
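
# Executable documentation for the nesting rules above; never called by the
# script. The markup is hypothetical, chosen for the example (real wiki2xml
# output is messier).
def _demoNesting():
    # "heading": ["article"] makes a second <heading> pop back up to
    # <article> instead of nesting inside the first one
    soup = WikiSoup("<article><heading>A<heading>B</heading></article>")
    assert len(soup.article.findAll("heading", recursive=False)) == 2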


# Set of plain text tags: we will extract text from inside these tags
PLAIN_TAGS = set([
    "bold",
    "italics",
    "sup",
    "sub",
    "preblock",
    "preline",

    "templatevar",  # Used for some quote templates
    "part",
])


def extractLinkText(linkNode):
    """Extracts the display text from a <link> tag. Returns None for links
    that produce no text (bare external links; Image, Category and language
    links)."""
    assert linkNode.name == "link"

    try:
        if len(linkNode.contents) == 0:
            # <link href="..." type="external" />
            return None

        first = linkNode.contents[0]
        if isinstance(first, BeautifulSoup.NavigableString):
            # <link href="..." type="external">text</link>
            assert linkNode["type"] == "external"
            # External links could contain tags such as <space />
            return "".join(extractText(linkNode))

        assert first.name == "target"
        # <target> can contain other tags, in particular <template>
        target_text = "".join(extractText(first))

        # Skip Image, Category and language links
        if ":" in target_text:
            return None

        # A link is a <target>, optionally followed by <part> and/or <trail>:
        # <link><target>foo</target><part>words</part></link>
        # <link><target>foo</target><part>word</part><trail>s</trail></link>
        # <link><target>foo</target><trail>s</trail></link>
        assert len(linkNode.contents) <= 3
        text = None
        foundPart = False
        foundTrail = False
        for child in linkNode:
            assert not foundTrail
            if child.name == "target":
                # If the target contains more than one thing, then this is a bad link: extract nothing
                if len(child.contents) != 1:
                    return None
                assert text is None
                text = child.string
            elif child.name == "part":
                # Only take the first <part>. There should only be one, but sometimes users add more
                if foundPart:
                    continue
                assert text is not None
                foundPart = True
                # The <part> can have HTML tags like <part>77<sup>th</sup></part>
                # Or worse, <template>
                text = "".join(extractText(child))
                    
            elif child.name == "trail":
                assert text is not None
                foundTrail = True
                text += child.string
            else:
                assert False, "unexpected tag in <link>: %s" % child.name
        return text
    except:
        # Dump the offending link for debugging, then re-raise
        print linkNode
        raise
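

# Executable documentation for extractLinkText; never called by the script.
# The fragments are hypothetical, mirroring the link shapes handled above.
def _demoExtractLinkText():
    # <target> plus <trail>: the trail is appended to the target text
    soup = WikiSoup("<link><target>dog</target><trail>s</trail></link>")
    assert extractLinkText(soup.link) == "dogs"
    # A colon in the target marks an Image/Category/language link
    soup = WikiSoup("<link><target>Category:Mammals</target></link>")
    assert extractLinkText(soup.link) is None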

# All the tags that should be skipped
SKIP_TAGS = set([
    "template",
    "ref",
    "table",
    "tablerow", 
    "tablecell",
    "magic_variable",
    "list",
])
# All the extensions that should be skipped
SKIP_EXTENSIONS = set([
    "ref",
    "references",
    "imagemap",
    "gallery",
    "math",
    "hr",
    "timeline",
    "poem",
    "hiero",
])
INCLUDE_EXTENSIONS = set([
    "blockquote",
    "noinclude",
    "onlyinclude",
    "includeonly",
    "nowiki",
    "var",  # Variables: needed to understand math/physics
    "sarcasm",  # Incorrectly parsed <sarcasm> tags in the "Leet" article
])

def extractText(paragraph_node):
    """Returns a list of text fragments extracted from a wiki2xml node,
    typically a <paragraph>."""
    text = []
    for child in paragraph_node:
        if isinstance(child, BeautifulSoup.NavigableString):
            text.append(child.string)
        elif child.name in SKIP_TAGS:
            # Skip the contents of templates, references and tables
            continue
        elif child.name == "extension":
            if len(child.contents) == 0:
                # If the extension is empty we don't care.
                continue
            name = child["extension_name"]
            if name in SKIP_EXTENSIONS:
                continue
            elif name in INCLUDE_EXTENSIONS:
                # Extract text from extensions which just include text
                text.extend(extractText(child))
            else:
                print child
                raise ValueError("unknown extension: " + name)
        elif child.name == "link":
            extracted = extractLinkText(child)
            if extracted is not None:
                text.append(extracted)
        elif child.name == "space":
            assert len(child.contents) == 0
            text.append(" ")
        else:
            # Recursively extract text out of tags like <italics>
            if not (child.name in PLAIN_TAGS or child.name.startswith("xhtml")):
                # Dump the unexpected tag before failing
                print child
                assert False, "unexpected tag: %s" % child.name
            text.extend(extractText(child))

    return text
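

# Executable documentation for extractText; never called by the script.
# Hypothetical fragment: templates are dropped, <space /> becomes a space,
# and formatting tags such as <bold> keep their text.
def _demoExtractText():
    soup = WikiSoup("<paragraph>a<space /><bold>b</bold>"
                    "<template>dropped</template></paragraph>")
    assert "".join(extractText(soup.paragraph)) == "a b"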


# Stolen from scrape: http://zesty.ca/python/scrape.py
HTML_ENTITY = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
def HTMLDecode(text):
    """Decodes HTML entities in text."""

    def HTMLEntityReplace(match):
        entity = match.group(1)
        if entity.startswith('#x'):
            return unichr(int(entity[2:], 16))
        elif entity.startswith('#'):
            return unichr(int(entity[1:]))
        elif entity in htmlentitydefs.name2codepoint:
            return unichr(htmlentitydefs.name2codepoint[entity])
        else:
            return match.group(0)

    return HTML_ENTITY.sub(HTMLEntityReplace, text)
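

# Executable documentation for HTMLDecode; never called by the script.
# The inputs are chosen purely for illustration.
def _demoHTMLDecode():
    # Decimal, hexadecimal, and named entities all decode
    assert HTMLDecode("&#65;&#x42;&amp;") == u"AB&"
    # Unknown entity names pass through unchanged
    assert HTMLDecode("&bogus;") == "&bogus;"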


def extractWikipediaText(filename):
    """Extracts the plain text of a Wikipedia article in wiki2xml XML format.
    Returns a single Unicode string with paragraphs separated by blank
    lines."""

    infile = open(filename)
    data = infile.read()
    infile.close()

    dom = WikiSoup(data)
    text = []
    # Iterate over the paragraphs
    for paragraph in dom.findAll("paragraph"):
        try:
            parts = extractText(paragraph)
        except:
            # Dump the offending paragraph for debugging, then re-raise
            print paragraph
            raise
        for i, fragment in enumerate(parts):
            # wiki2xml does not convert &nbsp; or other HTML entities
            parts[i] = HTMLDecode(fragment)
        text.extend(parts)
        text.append("\n\n")

    return "".join(text)


if __name__ == "__main__":
    import sys
    sys.stdout.write(extractWikipediaText(sys.argv[1]).encode("UTF-8"))
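
# Example invocation (the file names here are hypothetical):
#   python wikipedia2text.py Article.xml > Article.txt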
