"""Purpose: web data acquisition.

Download the web file ``most-misspelled`` from
http://www.cs.virginia.edu/~cs1112/datasets/words/ and print its entries:
first as a single list, then one entry per output line.
"""

# need help to get web data - so import the capability
from urllib.request import urlopen

# specify the link of interest
CS1112_WORDS_WEB_FOLDER = "http://www.cs.virginia.edu/~cs1112/datasets/words/"
FILE_NAME = "most-misspelled"

# build the link to the file of interest from the folder and file name, so
# the URL is defined in exactly one place
link = CS1112_WORDS_WEB_FOLDER + FILE_NAME

# get a connection to stream the web resource of interest; the with-block
# closes the connection once the contents have been read, even if reading
# raises an error
with urlopen( link ) as stream:
    # read stream to get the raw (bytes) contents of the page
    content = stream.read()

# decode contents into plain text form
text = content.decode( "UTF-8" )

# get the entries that make up the text
#
# With no argument, .split() separates the text at every run of whitespace
# (spaces, tabs and newlines alike) and never produces empty strings.
# NOTE(review): this equals "one entry per line" only if no line contains
# internal whitespace -- confirm against the dataset.
#
# What happens when you do text.split( '\n' ) instead? '\n' is the newline
# character. When you pass an argument to .split(), the text is split at
# exactly that separator and a list of the pieces between separators is
# returned -- so a trailing newline in the text yields a final empty string.
lines = text.split()

# print the whole list of entries at once
print( lines )

# print the entries one by one, each on its own output line
for line in lines:
    print( line )