""" Purpose: dataset nuance introduction """ # get access to web-based data support import url # set repository folder DATASET_FOLDER = "http://www.cs.virginia.edu/~cs1112/datasets/csv/" # set name of the dataset file_name = "best-sellers.csv" # specify link link = DATASET_FOLDER + file_name # get dataset table = url.get_dataset( link ) # get_dataset() parses a .csv file using the rules ( row elements separated by commas, rows separated by \n ) # to format our datasets pythonically as a list of lists # define and print dataset header header = table[ 0 ] # first row is going to be our header row for this dataset # the header is a list because table is a list of lists. the first item in the dataset is the first row, which is a list print( "header:", header ) print() # define and print dataset books books = table[ 1 : ] # the second row ( index = 1 ) on is our actual data in the dataset print( "books:", books ) print() # determine and print index of names, sales, and date columns of dataset sales_column = header.index( "Sales" ) name_column = header.index( "Name" ) date_column = header.index( "Date" ) # header is a list, so we use the index() function to grab the location of each label # those labels will correspond to our column numbers in our books dataset, which is a list of lists excluding the # header row # **header columns should align with your data columns otherwise you've got a bad dataset on your hands print( "sales column:", sales_column ) print( "name column:", name_column ) print( "date column:", date_column ) print() # determine total books solds among the best sellers of all time total = 0 # since we're accumulating a sum, we start it at 0 # accumulation does not magically go away after Test 1, we will continue to build on those topics for book in books : sold = book[ sales_column ] # we found the sales_column above using the index() function total = total + sold # since we're accumulating a sum, we say accum = accum + new print( "total sold:", total ) print() # build a list of the book publication dates dates = [] # list accumulator for row in books : # we're looping through books, which is a dataset excluding the header row. each run of this loop row is a different # list holding the information of a book. we use the date_column index found earlier to grab out the date and # add it to our list of dates year = row[ date_column ] dates.append( year ) # add to our accumulator using the append() function print( "dates:", dates ) # dates is a column from our dataset. it is the publication year, stored in each row in the position date_column print() # determine earliest and latest publication date earliest = min( dates ) latest = max( dates ) # min() and max() operate on dates, which is a list of numbers # why are the dates already numbers? get_dataset() handled the casting for us print( "earliest:", earliest ) print( "latest :", latest ) print() # determine average publication date date_total = sum( dates ) # <--------------------- HUH? # now that you've gotten through Test 1 we can use the built-in sum() function! yay nbr_of_dates = len( dates ) average_date = date_total // nbr_of_dates # arithmetic average using integer division returns the average year of these best sellers' publication print( "average date:", average_date ) print() # determine earliest and latest published books # to do so need to first find their indices into dates list, those # indices correspond to the row indices into books list row_earliest = dates.index( earliest ) row_latest = dates.index( latest ) # since we looped through the books dataset in order, the position of the earliest date will give you the row index # of the earliest book published. print( "row with earliest book:", row_earliest ) print( "row with latest book :", row_latest ) print() # use those indices to look at corresponding rows into books dataset earliest_row = books[ row_earliest ] # these are lists. we are grabbing 1 list out of the table books (list of lists) latest_row = books[ row_latest ] # print those rows print( "info on earliest:", earliest_row ) print( "info on latest: ", latest_row ) print() # print just the names of those books name_column = header.index( "Name" ) # ** index( element ) tells you the number index of where the element is in the list # ** find( ch ) tells you the number index of where ch is in a string earliest_name = earliest_row[ name_column ] # here we're grabbing out earlier cells. this is equivalent to latest_name = latest_row[ name_column ] # earliest_name = books[ earliest_row ][ name_column ] # we can use variables in the brackets to access the element as long as that variable is in range( 0, len( sequence ) ) # len( list_name ) hands back the number of elements in the list # len( string_name ) hands back the number of characters in the list # len( dataset_name ) hands back the number of row lists in the table print( "name of earliest:", earliest_name ) print( "name of latest: ", latest_name ) """ <---- will move down to reveal more of the program """