''' Purpose: introduce web data acquisition -- print contents of
    word-of-the-day.txt from http://www.cs.virginia.edu/~cs1112/datasets/words/
'''

# Need help to get web data -- so import the capability.
# urlopen() opens a URL and hands back a response object we can read from.
from urllib.request import urlopen

# IMPORTANT CONSTANTS

# Web folder that holds the word datasets.
CS1112_WORDS_WEB_FOLDER = "http://www.cs.virginia.edu/~cs1112/datasets/words/"

# The specific resource we want from inside that folder.
# NOTE(review): the module docstring mentions "word-of-the-day.txt" while this
# name has no extension -- confirm which one the server actually serves.
FILE_NAME = "word-of-the-day"

# Get a link to the file of interest: folder + file name, the same way any
# URL is built up from "/"-separated pieces.
link = CS1112_WORDS_WEB_FOLDER + FILE_NAME
print("Link=", link)

# Get a connection to stream the web resource of interest. The `with` block
# closes the connection as soon as we are done reading, even if an error
# occurs part-way through.
with urlopen(link) as stream:
    print("Stream= ", stream)

    # Read the stream to get its encoded contents (a bytes object).
    encoding = stream.read()

print("Encoding=", encoding)

# Decode the contents into plain text form: UTF-8 is the particular encoding
# used here to translate the raw bytes into a str of readable characters.
text = encoding.decode('UTF-8')

# Clean up the text to get the word: remove all leading and trailing
# whitespace.
word = text.strip()

# So what did we do? Read the web resource, decoded its bytes into characters
# using UTF-8, stripped the surrounding whitespace, and now print the result.
# (The original course notes mention "ineffable" as the word they saw.)

# Print word of the day.
print(word)