''' Purpose: introduce web data acquisition -- print contents of the file
    word-of-the-day from web folder
    http://www.cs.virginia.edu/~cs1112/datasets/words/
'''

# We need help to get web data, so import the capability.
# urllib is a Python built-in library. A module (or library) is a collection
# of predefined functions and variables that someone else wrote and that we
# can import and use. urllib.request is the module we import from, and
# urlopen is the function we take from it: it opens a connection to a web
# link.
from urllib.request import urlopen

# IMPORTANT CONSTANTS

# Web folder where Professor Cohoon keeps a collection of datasets.
CS1112_WORDS_WEB_FOLDER = 'http://www.cs.virginia.edu/~cs1112/datasets/words/'

# Name of the file of interest inside that folder.
FILE_NAME = 'word-of-the-day'

# Build the link to the file of interest: folder + file, the same way a path
# on your own computer is a folder followed by a file name. The result is
# 'http://www.cs.virginia.edu/~cs1112/datasets/words/word-of-the-day'.
link = CS1112_WORDS_WEB_FOLDER + FILE_NAME

# Get a connection to stream the web resource of interest. The with-statement
# closes the connection as soon as the read is done, even if the read fails,
# so the connection is never left open.
with urlopen( link ) as stream :
    # Read the stream to get its encoded contents (a bytes object).
    page = stream.read()

# Show the contents as the COMPUTER sees them, not as a person would read
# them -- for example b'ineffable\n'.
print( 'page=', page )

# Decode the bytes into plain text (UTF-8); this produces a string holding
# what is on the web page.
text = page.decode( 'utf-8' )

# Clean up the text to get the word: strip() removes leading and trailing
# whitespace such as the newline at the end.
word = text.strip()

# Print the word of the day.
print( 'word of the day:', word )