Clearly our script still has a bug.
| Given the input... | we expected... | but got |
|---|---|---|
| `<b>"foo"</b>` | `"foo"` | `foo` |
| `"<b>foo</b>"` | `"foo"` | `<b>foo</b>` |
We could scatter print statements in the code to see what is going on, or just keep changing things until it works - but this is not the way to do it!
#!/usr/bin/python3
def removeHtmlMarkup(s):
    """Strip HTML markup from ``s`` (instrumented, still-buggy version).

    Prints the current character and the ``tag`` / ``quote`` flags once per
    input character as a debugging trace, then returns the filtered text.

    The logic is intentionally left exactly as in the lesson: the
    surrounding text uses this version to show a failing run, so the
    defect is documented below rather than fixed.

    Args:
        s: The text to filter.

    Returns:
        The characters of ``s`` that the state machine considers to be
        outside of markup.
    """
    tag = False    # True while between '<' and '>'
    quote = False  # True while inside a quoted string
    out = ""
    for c in s:
        print(c, tag, quote)  # NEW
        if c == '<' and not quote:  # Start of markup
            tag = True
        elif c == '>' and not quote:  # End of markup
            tag = False
        # NOTE: `and` binds tighter than `or`, so this is evaluated as
        #   c == '"' or (c == "'" and tag)
        # i.e. a double quote flips `quote` even outside a tag.
        elif c == '"' or c == "'" and tag:  # Quote
            quote = not quote
        elif not tag:
            out = out + c
    return out
""" We know these fail """
if __name__ == "__main__":
    # Prints the actual result followed by the expected value in brackets.
    print(removeHtmlMarkup('"<b>foo</b>"'), '\t["foo"]')
    # print (removeHtmlMarkup('<b>"foo"</b>'), '\t\t["foo"]')
Imagine dealing with 10,000 characters of input.
We know the problem has something to do with checking quotes, so why not just remove the check on the status of quote?
#!/usr/bin/python3
def removeHtmlMarkup(s):
    """Strip HTML markup from ``s`` (variant without the quote guards).

    Compared with the guarded version, the ``not quote`` conditions on the
    tag checks are gone, so ``<`` and ``>`` always open and close a tag.
    ``quote`` is still toggled but no longer influences the output.

    This is a trial-and-error change shown as an example of bad debugging
    practice; its behaviour is preserved as-is.

    Args:
        s: The text to filter.

    Returns:
        The characters of ``s`` that were seen while ``tag`` was False and
        that did not match the quote branch.
    """
    tag = False    # True while between '<' and '>'
    quote = False  # Toggled below, but never consulted in this variant
    out = ""
    for c in s:
        if c == '<':  # Start of markup MODIFIED
            tag = True
        elif c == '>':  # End of markup MODIFIED
            tag = False
        # NOTE: evaluated as  c == '"' or (c == "'" and tag),  so every
        # double quote is swallowed here, inside or outside a tag.
        elif c == '"' or c == "'" and tag:  # Quote
            quote = not quote
        elif not tag:
            out = out + c
    return out
""" We know these fail """
if __name__ == "__main__":
    # Prints the actual result followed by the expected value in brackets.
    print(removeHtmlMarkup('"<b>foo</b>"'), '\t["foo"]')
    print(removeHtmlMarkup('<b>"foo"</b>'), '\t["foo"]')
The quotes are still missing...
Why not just remove the check on quotes altogether?
#!/usr/bin/python3
def removeHtmlMarkup(s):
    """Strip HTML markup from ``s`` (variant with no quote handling at all).

    Every ``<`` opens a tag and every ``>`` closes one; all characters seen
    while outside a tag are copied to the result, quotes included.
    Because quoted attribute values are no longer recognised, a ``>``
    inside an attribute value ends the tag early.

    This is a trial-and-error change shown as an example of bad debugging
    practice; its behaviour is preserved as-is.

    Args:
        s: The text to filter.

    Returns:
        The characters of ``s`` found outside ``<...>`` spans.
    """
    tag = False    # True while between '<' and '>'
    quote = False  # Unused now that the quote branch is commented out
    out = ""
    for c in s:
        if c == '<':  # Start of markup
            tag = True
        elif c == '>':  # End of markup
            tag = False
        # elif c == '"' or c == "'" and tag: # Quote REMOVED
        #     quote = not quote # REMOVED
        elif not tag:
            out = out + c
    return out
""" Does it work? """
if __name__ == "__main__":
    # Prints the actual result followed by the expected value in brackets.
    print(removeHtmlMarkup('"<b>foo</b>"'), '\t["foo"]')
    print(removeHtmlMarkup('<b>"foo"</b>'), '\t["foo"]')
    print(removeHtmlMarkup('<a href=">">foo</a>'), '\t[foo]')
The first two work! But do the old tests still work?
These examples of bad practice come from Steve McConnell's book Code Complete [Supporting Web Site]
Simply fix the special case that we know doesn't work...
#!/usr/bin/python3
def removeHtmlMarkup(s):
    """Strip HTML markup from ``s`` (variant with a hard-coded special case).

    One known-failing input is intercepted up front and its expected
    answer returned directly; every other input goes through the original
    state machine unchanged. This "fix the symptom" patch is shown as an
    example of bad practice, so it is kept rather than replaced by a real
    fix.

    Args:
        s: The text to filter.

    Returns:
        The hard-coded answer for the special-cased input, otherwise the
        characters of ``s`` that the state machine considers to be outside
        of markup.
    """
    # Special case: papers over exactly one failing input.
    if s == '"<b>foo</b>"':
        return '"foo"'
    # if s == '<b>"foo"</b>':
    #     return '"foo"'
    tag = False    # True while between '<' and '>'
    quote = False  # True while inside a quoted string
    out = ""
    for c in s:
        if c == '<' and not quote:  # Start of markup
            tag = True
        elif c == '>' and not quote:  # End of markup
            tag = False
        # NOTE: `and` binds tighter than `or`, so this is evaluated as
        #   c == '"' or (c == "'" and tag)
        # i.e. a double quote flips `quote` even outside a tag.
        elif c == '"' or c == "'" and tag:  # Quote
            quote = not quote
        elif not tag:
            out = out + c
    return out
""" We know these failed """
if __name__ == "__main__":
    # Prints the actual result followed by the expected value in brackets.
    print(removeHtmlMarkup('"<b>foo</b>"'), '\t["foo"]')
    # print (removeHtmlMarkup('<b>"foo"</b>'), '\t["foo"]')