"""Purpose: practice retrieving a course dataset from the web.

The same CSV file is fetched twice: once as raw text and once as a parsed
list of rows, which is then split into its header row and its data rows.
"""

# url.py is the course-provided helper module; it has to be in the same
# folder as this program. New versions are handed out as the course goes on,
# so keep the latest copy with your CS 1112 files.
import url

# Web folder holding the course CSV datasets.
# CSV stands for "comma-separated values": a plain text file in which commas
# separate the cells of a row and newlines (\n) separate the rows.
CSV_WEB_FOLDER = 'http://www.cs.virginia.edu/~cs1112/datasets/csv/'

# Ask which dataset to fetch.
reply = input('Enter name of dataset: ')
print()

# Only the surrounding whitespace is removed. The case of the reply is left
# alone on purpose: some file names contain upper-case letters, so lower()
# could turn a valid name into one that does not exist. The user is trusted
# to type the name exactly.
file_name = reply.strip()

# Full web address of the requested dataset.
link = CSV_WEB_FOLDER + file_name

# First look: get_contents() hands the whole web file back as one big string.
dataset = url.get_contents(link)
print('dataset ( using get_contents() -> big string )')
print(dataset)
print()

# Pause so the raw text can be inspected before moving on.
reply = input('Enter when ready: ')

# Second look: get_dataset() hands the same file back as a list of lists.
# Per the course notes it also converts numeric cells into numbers.
# Printing the list directly shows everything on one long line; printing
# row by row needs a loop (done below for the data rows).
dataset = url.get_dataset(link)
print("dataset ( using get_dataset() -> list of lists )")
print(dataset)
print()

# Split the dataset into its header and its data.
# Slicing review:
#   [ i : j ] -> positions i through j-1
#   [   : j ] -> positions 0 through j-1
#   [ i :   ] -> position i through the end
header = dataset[0]   # first row: the labels explaining what each column means
data = dataset[1:]    # second row onward: the actual data of the dataset

# Show the header row.
print('header:')
print(header)
print()

# Show the data, one row per line.
print('data:')
for row in data:
    print(row)
print()

# Summing a numeric dataset: use a for loop with an accumulator.
# For the sum of one specific row i: summation = sum( dataset[ i ] )

# How hard would it be to go to an arbitrary web page and turn its
# unstructured data into a dataset? Hard. We work with .csv files because
# they already have a structure, and the code that parses them relies on
# that structure.