Given the input... | we want... |
---|---|
<b>foo</b> | foo |
<a href='foo.html'>foo</a> | foo |
<a href="foo.html">foo</a> | foo |
<a href=">">foo</a> | foo |
<b>"foo"</b> | "foo" |
"<b>foo</b>" | "foo" |
<"b">foo</"b"> | foo |
States: no-tag / tag
#!/usr/bin/python3 def removeHtmlMarkup(s): tag = False out = "" for c in s: if c == '<': # Start of markup tag = True elif c == '>': # End of markup tag = False elif not tag: out = out + c return out """ Which of these will fail??? """ if __name__ == "__main__": print (removeHtmlMarkup('<b>foo</b>')) print (removeHtmlMarkup('<em>foo</em>')) print (removeHtmlMarkup('<a href="foo.html">foo</a>')) print (removeHtmlMarkup('<a href="">foo</a>')) print (removeHtmlMarkup('<a href=">">foo</a>'))
Which of these will fail?
<b>foo</b> <em>foo</em> <a href="foo.html">foo</a> <a href="">foo</a> <a href=">">foo</a>
States: no-tag,no-quote / tag,no-quote / tag,quote
#!/usr/bin/python3 ############################################################### ### THIS CODE STILL CONTAINS A BUG - IF YOU SPOT IT, PLEASE ### ### DO NOT CALL OUT AND TELL EVERYONE ### ############################################################### def removeHtmlMarkup(s): tag = False quote = False out = "" for c in s: if c == '<' and not quote: # Start of markup MODIFIED tag = True elif c == '>' and not quote: # End of markup MODIFIED tag = False elif c == '"' or c == "'" and tag: # Quote NEW quote = not quote # NEW elif not tag: out = out + c return out """ Which of these will fail??? """ if __name__ == "__main__": # Old tests print ("Old tests...") print (removeHtmlMarkup('<b>foo</b>')) print (removeHtmlMarkup('<em>foo</em>')) print (removeHtmlMarkup('<a href="foo.html">foo</a>')) print (removeHtmlMarkup('<a href="">foo</a>')) print (removeHtmlMarkup('<a href=">">foo</a>')) # New tests print ("\nNew tests...") print (removeHtmlMarkup('<b>foo</b>'), '\t\t[foo]') print (removeHtmlMarkup('<b>"foo"</b>'), '\t\t["foo"]') print (removeHtmlMarkup('"<b>foo</b>"'), '\t["foo"]') print (removeHtmlMarkup('<"b">foo</"b">'), '\t\t[foo]')
Which of these still fails?
<b>foo</b> <b>"foo"</b> "<b>foo</b>" <"b">foo</"b">