''' Purpose: web data acquisition -- print line-by-line the list of lines making up
        the contents of web file most-misspelled from
        http://www.cs.virginia.edu/~cs1112/datasets/words/
'''

# need help to get web data - so import the capability
from urllib.request import urlopen

# specify the link of interest
CS1112_WORDS_WEB_FOLDER = "http://www.cs.virginia.edu/~cs1112/datasets/words/"
FILE_NAME = "most-misspelled"

# build the link to the file of interest from the folder and the file name
# (changing FILE_NAME above is all it takes to fetch a different dataset)
link = CS1112_WORDS_WEB_FOLDER + FILE_NAME

# get a connection to stream the web resource of interest -- the with statement
# closes the connection for us once the read is done, even if the read fails
with urlopen( link ) as stream:
    # read stream to get the contents of the page (these are raw bytes)
    content = stream.read()

# decode contents into plain text form
text = content.decode( "UTF-8" )

# get the lines that make up the text

# With no argument, .split() cuts the text at every run of whitespace -- spaces,
# tabs, AND newlines -- so lines ends up as a list of words. Because each line of
# this file holds a single word, that is the same as a list of its lines.
lines = text.split()

# So I have a question.
# What happens when you do text.split( '\n' )?

# '\n' is a newline character!

# When you pass an argument into .split(), it cuts the string at exactly that
# character and returns a list of the pieces in between. So text.split( '\n' )
# cuts only at newlines: a line containing spaces would stay in one piece, and a
# text ending with a newline would produce an extra empty string at the end.
# Without an argument, .split() cuts at any whitespace and never produces empty
# strings. This is just like what we've been using .split() for the whole semester!

print( lines ) # Prints the whole list of words at once, in list form

# print the lines one by one

for line in lines:  # For each line in our list of lines (list of words)
    print( line ) # Print each line on a separate line