Example code

Stripping HTML markup

Given the input...we want...
<b>foo</b>foo
<a href='foo.html'>foo</a>foo
<a href="foo.html">foo</a>foo
<a href=">">foo</a>foo
<b>"foo"</b>"foo"
"<b>foo</b>""foo"
<"b">foo</"b">foo

States: no-tag / tag

pic1
#!/usr/bin/python3

def removeHtmlMarkup(s):
    tag = False
    out = ""

    for c in s:
        if c == '<':   # Start of markup
            tag = True
        elif c == '>': # End of markup
            tag = False
        elif not tag:
            out = out + c

    return out

""" Which of these will fail??? """
if __name__ == "__main__":
    print (removeHtmlMarkup('<b>foo</b>'))
    print (removeHtmlMarkup('<em>foo</em>'))
    print (removeHtmlMarkup('<a href="foo.html">foo</a>'))
    print (removeHtmlMarkup('<a href="">foo</a>'))
    print (removeHtmlMarkup('<a href=">">foo</a>'))
    



[Download strip1.py]

Which of these will fail?

<b>foo</b>
<em>foo</em>
<a href="foo.html">foo</a>
<a href="">foo</a>
<a href=">">foo</a>

States: no-tag,no-quote / tag,no-quote / tag,quote

pic2
#!/usr/bin/python3

###############################################################
### THIS CODE STILL CONTAINS A BUG - IF YOU SPOT IT, PLEASE ###
### DO NOT CALL OUT AND TELL EVERYONE                       ###
###############################################################

def removeHtmlMarkup(s):
    tag   = False
    quote = False
    out   = ""

    for c in s:
        if c == '<' and not quote:          # Start of markup  MODIFIED
            tag = True
        elif c == '>' and not quote:        # End of markup    MODIFIED
            tag = False
        elif c == '"' or c == "'" and tag:  # Quote            NEW
            quote = not quote               #                  NEW
        elif not tag:
            out = out + c

    return out

""" Which of these will fail??? """
if __name__ == "__main__":
    # Old tests
    print ("Old tests...")
    print (removeHtmlMarkup('<b>foo</b>'))
    print (removeHtmlMarkup('<em>foo</em>'))
    print (removeHtmlMarkup('<a href="foo.html">foo</a>'))
    print (removeHtmlMarkup('<a href="">foo</a>'))
    print (removeHtmlMarkup('<a href=">">foo</a>'))

    # New tests
    print ("\nNew tests...")
    print (removeHtmlMarkup('<b>foo</b>'),     '\t\t[foo]')
    print (removeHtmlMarkup('<b>"foo"</b>'),   '\t\t["foo"]')
    print (removeHtmlMarkup('"<b>foo</b>"'),   '\t["foo"]')
    print (removeHtmlMarkup('<"b">foo</"b">'), '\t\t[foo]')

[Download strip2.py]

Which of these still fails?

<b>foo</b>
<b>"foo"</b>
"<b>foo</b>"
<"b">foo</"b">
Continue