| Given the input... | we want... |
|---|---|
| <b>foo</b> | foo |
| <a href='foo.html'>foo</a> | foo |
| <a href="foo.html">foo</a> | foo |
| <a href=">">foo</a> | foo |
| <b>"foo"</b> | "foo" |
| "<b>foo</b>" | "foo" |
| <"b">foo</"b"> | foo |
States: no-tag / tag
#!/usr/bin/python3
def removeHtmlMarkup(s):
tag = False
out = ""
for c in s:
if c == '<': # Start of markup
tag = True
elif c == '>': # End of markup
tag = False
elif not tag:
out = out + c
return out
""" Which of these will fail??? """
if __name__ == "__main__":
print (removeHtmlMarkup('<b>foo</b>'))
print (removeHtmlMarkup('<em>foo</em>'))
print (removeHtmlMarkup('<a href="foo.html">foo</a>'))
print (removeHtmlMarkup('<a href="">foo</a>'))
print (removeHtmlMarkup('<a href=">">foo</a>'))
Which of these will fail?
<b>foo</b> <em>foo</em> <a href="foo.html">foo</a> <a href="">foo</a> <a href=">">foo</a>
States: no-tag,no-quote / tag,no-quote / tag,quote
#!/usr/bin/python3
###############################################################
### THIS CODE STILL CONTAINS A BUG - IF YOU SPOT IT, PLEASE ###
### DO NOT CALL OUT AND TELL EVERYONE ###
###############################################################
def removeHtmlMarkup(s):
tag = False
quote = False
out = ""
for c in s:
if c == '<' and not quote: # Start of markup MODIFIED
tag = True
elif c == '>' and not quote: # End of markup MODIFIED
tag = False
elif c == '"' or c == "'" and tag: # Quote NEW
quote = not quote # NEW
elif not tag:
out = out + c
return out
""" Which of these will fail??? """
if __name__ == "__main__":
# Old tests
print ("Old tests...")
print (removeHtmlMarkup('<b>foo</b>'))
print (removeHtmlMarkup('<em>foo</em>'))
print (removeHtmlMarkup('<a href="foo.html">foo</a>'))
print (removeHtmlMarkup('<a href="">foo</a>'))
print (removeHtmlMarkup('<a href=">">foo</a>'))
# New tests
print ("\nNew tests...")
print (removeHtmlMarkup('<b>foo</b>'), '\t\t[foo]')
print (removeHtmlMarkup('<b>"foo"</b>'), '\t\t["foo"]')
print (removeHtmlMarkup('"<b>foo</b>"'), '\t["foo"]')
print (removeHtmlMarkup('<"b">foo</"b">'), '\t\t[foo]')
Which of these still fails?
<b>foo</b> <b>"foo"</b> "<b>foo</b>" <"b">foo</"b">