"""Purpose: dataset nuance introduction.

Walks through a small "dataset" stored as a list of lists (one sublist per
book) and shows how to:

* locate columns by name using the header row,
* accumulate a total over one column,
* collect one column into its own list,
* use min() / max() / sum() / len() on that list, and
* map a position in the column list back to the matching dataset row.

Everything runs at module level and prints its intermediate results.
"""

# ---------------------------------------------------------------------------
# Define and print the dataset header.
# ---------------------------------------------------------------------------
# The header describes the dataset: it holds the column names. In a dataset
# read from a web file it would be the first row (index 0). `header` is just a
# variable we chose for that row; it is not a keyword or predefined name.
header = ["Name", "Author", "Language", "Date", "Sales"]

print("header:", header)
print()

# ---------------------------------------------------------------------------
# Determine and print the index of the sales, name, and date columns.
# ---------------------------------------------------------------------------
# list.index(x) returns the position of the first item equal to x.
sales_column = header.index("Sales")  # 4
name_column = header.index("Name")    # 0
date_column = header.index("Date")    # 3

print("sales column:", sales_column)
print("name column:", name_column)
print("date column:", date_column)
print()

# ---------------------------------------------------------------------------
# Define the dataset.
# ---------------------------------------------------------------------------
# `books` is a list of lists: the outer list holds one inner list per book,
# and every inner list follows the column order given by `header`.
#
#   books[r]    -> row r (a sublist)
#   books[r][c] -> column c of row r
#
#   books[0]    == ["Alice's Adventures in Wonderland", "Carroll", ...]
#   books[0][0] == "Alice's Adventures in Wonderland"
#   books[0][1] == "Carroll"
#   books[1][1] == "Christie"
books = [
    ["Alice's Adventures in Wonderland", "Carroll", "English", 1865, 100000000],
    ["And Then There Were None", "Christie", "English", 1939, 100000000],
    ["Dream of the Red Chamber", "Xueqin", "Chinese", 1754, 100000000],
    ["Don Quixote", "de Cervantes", "Spanish", 1605, 500000000],
    ["Harry Potter", "Rowling", "English", 1997, 447000000],
    ["The Hobbit", "Tolkien", "English", 1937, 150000000],
    ["The Little Prince", "de Saint-Exupery", "French", 1943, 150000000],
    ["The Lord of the Rings", "Tolkien", "English", 1954, 150000000],
    ["A Tale of Two Cities", "Dickens", "English", 1859, 200000000],
]

print("books:", books)  # printing a list of lists in one go is hard to read
print()

# ---------------------------------------------------------------------------
# Print the rows of the dataset, one per line.
# ---------------------------------------------------------------------------
# `row` is the loop variable bound to each sublist in turn; a sublist of a
# dataset is also referred to as a "row".
for row in books:
    print("row:", row)
print()

# ---------------------------------------------------------------------------
# Determine the total number of books sold across the dataset.
# ---------------------------------------------------------------------------
total = 0  # accumulator for the sales of all books
for book in books:  # `book` is also a row of the dataset
    print(book)
    # sales_column was found earlier; it indexes the "Sales" column.
    sold = book[sales_column]
    print(" sold:", sold)
    total = total + sold  # add this book's sales to the running total
    print("total so far:", total)

print("total sold:", total)

# Dividing by 10 ** 9 (one billion) expresses the total in billions.
billions_of_books = total / (10 ** 9)
print("total sold:", billions_of_books, "billion")
print()

# ---------------------------------------------------------------------------
# Build a list of the publication dates.
# ---------------------------------------------------------------------------
# To find which book was published first / last we first isolate the values
# of the "Date" column.
dates = []  # accumulates the date of every book, in row order
for row in books:
    print(row)
    # date_column was found earlier; it indexes the "Date" column.
    year = row[date_column]
    print(" year:", year)
    # append() mutates the list in place, so no reassignment is needed.
    dates.append(year)
    print(" dates so far:", dates)
    # NOTE(review): the original indentation was lost; this blank-line print
    # is assumed to separate loop iterations -- confirm against the original.
    print()

print("dates:", dates)  # final contents once the loop has finished
print()

# ---------------------------------------------------------------------------
# Determine the earliest and latest publication dates.
# ---------------------------------------------------------------------------
earliest = min(dates)  # smallest value in the sequence
latest = max(dates)    # largest value in the sequence

print("earliest:", earliest)
print("latest :", latest)
print()

# ---------------------------------------------------------------------------
# Determine the average publication date.
# ---------------------------------------------------------------------------
# sum() adds up a sequence of numbers: [1, 2, 3] works, [1, '2', 3] does not.
date_total = sum(dates)
nbr_of_dates = len(dates)  # how many dates there are
# Floor division keeps the average as a whole year.
average_date = date_total // nbr_of_dates

print("average date:", average_date)
print()

# ---------------------------------------------------------------------------
# Determine the earliest and latest published books.
# ---------------------------------------------------------------------------
# `dates` was filled by walking `books` in order, so position i in `dates`
# belongs to books[i]. Finding a date's index therefore gives the row index.
row_earliest = dates.index(earliest)
row_latest = dates.index(latest)

print("row with earliest book:", row_earliest)
print("row with latest book :", row_latest)
print()

# Use those indices to look up the corresponding rows of the dataset.
#
#   i  dates[i]  books[i]
#   -  --------  ------------------------------------------------------
#   0  1865      "Alice's Adventures in Wonderland", "Carroll", ...
#   1  1939      "And Then There Were None", "Christie", ...
#   2  1754      "Dream of the Red Chamber", "Xueqin", ...
#   3  1605      "Don Quixote", "de Cervantes", ...    <- row_earliest
#   4  1997      "Harry Potter", "Rowling", ...        <- row_latest
#   5  1937      "The Hobbit", "Tolkien", ...
#   6  1943      "The Little Prince", "de Saint-Exupery", ...
#   7  1954      "The Lord of the Rings", "Tolkien", ...
#   8  1859      "A Tale of Two Cities", "Dickens", ...
earliest_row = books[row_earliest]  # the book that was published first
latest_row = books[row_latest]      # the book that was published last

print("info on earliest:", earliest_row)
print("info on latest: ", latest_row)
print()

# ---------------------------------------------------------------------------
# Print just the names of those books.
# ---------------------------------------------------------------------------
# name_column (0) was computed from the header above, so it is reused here.
# earliest_row is itself a list, so its columns can be indexed the same way:
#   earliest_row[0] == "Don Quixote"
#   earliest_row[1] == "de Cervantes"
earliest_name = earliest_row[name_column]
latest_name = latest_row[name_column]

print("name of earliest:", earliest_name)
print("name of latest: ", latest_name)