""" Purpose: dataset nuance introduction
"""

# get access to web-based data support
import url 

# where the course datasets live on the web, and which one this program studies
DATASET_FOLDER = "http://www.cs.virginia.edu/~cs1112/datasets/csv/"
file_name = "best-sellers.csv"

# the full web address is simply the folder followed by the file name
link = DATASET_FOLDER + file_name

# fetch the dataset from the web
# get_dataset() is expected to parse the .csv text ( cells separated by commas,
# rows separated by \n ) and hand it back as a list of lists, one inner list per row
table = url.get_dataset( link )

# row 0 of the table is the header row holding the column labels
# because table is a list of lists, header is itself a list
header = table[ 0 ]
print( "header:", header, end="\n\n" )

# every row from index 1 on is actual book data, so slice the header row off
books = table[ 1 : ]
print( "books:", books, end="\n\n" )


# locate the sales, name, and date columns of the dataset
# header is a list of labels, so index() reports the position of each label
# those positions double as column numbers for every row of books, since books is
# the same table minus the header row
# ** this only works when the header labels line up with the data columns --
#    otherwise you've got a bad dataset on your hands
sales_column, name_column, date_column = [
    header.index( label ) for label in ( "Sales", "Name", "Date" )
]

# report the three column numbers, then leave a blank line
print( "sales column:", sales_column )
print( "name  column:", name_column )
print( "date  column:", date_column, end="\n\n" )


# determine total books sold among the best sellers of all time

# running-sum accumulation: start the accumulator at 0, then fold in the sales
# cell of every book ( found through the sales_column index located earlier )
total = 0
for book in books :
    total += book[ sales_column ] # shorthand for accum = accum + new

print( "total sold:", total, end="\n\n" )

# build a list of the book publication dates

# books excludes the header row, so each row is a list describing one book
# pick the cell sitting at date_column out of every row, keeping the rows' order
dates = [ row[ date_column ] for row in books ]

# dates is effectively one column of the dataset: the value stored at position
# date_column in each book's row
print( "dates:", dates, end="\n\n" )

# determine earliest and latest publication date
# min() and max() compare the elements of dates directly
# NOTE: this assumes the dates are already numbers, i.e. that get_dataset()
# handled the casting when it parsed the file
earliest, latest = min( dates ), max( dates )

print( "earliest:", earliest )
print( "latest  :", latest, end="\n\n" )

# determine average publication date

# an arithmetic average needs two ingredients: how many values there are and
# what they add up to -- the built-in len() and sum() functions supply both
nbr_of_dates = len( dates )
date_total   = sum( dates )

# floor division ( // ) rounds the average down rather than leaving a fraction
average_date = date_total // nbr_of_dates

print( "average date:", average_date, end="\n\n" )

# determine earliest and latest published books

# dates was built by going through books in order, so a position in dates is
# also the row index of the matching book in books
# index() reports the first position at which the given value appears
row_earliest, row_latest = dates.index( earliest ), dates.index( latest )

print( "row with earliest book:", row_earliest )
print( "row with latest book  :", row_latest, end="\n\n" )

# use those indices to pull the matching rows out of books
# books is a list of lists, so each result is a single list ( one book's row )
earliest_row, latest_row = books[ row_earliest ], books[ row_latest ]

# print those rows
print( "info on earliest:", earliest_row )
print( "info on latest:  ", latest_row, end="\n\n" )

# print just the names of those books
# name_column was already located from the header earlier in the program and the
# header has not changed since, so it is reused here instead of searching again
# ** index( element ) tells you the number index of where the element is in the list
# ** find( ch ) tells you the number index of where ch is in a string

# grab the name cell out of each of the two rows found above. this is equivalent to
#     earliest_name = books[ row_earliest ][ name_column ]
#     latest_name   = books[ row_latest ][ name_column ]
earliest_name = earliest_row[ name_column ]
latest_name   = latest_row[ name_column ]
# we can use variables in the brackets to access the element as long as that variable is in range( 0, len( sequence ) )
    # len( list_name ) hands back the number of elements in the list
    # len( string_name ) hands back the number of characters in the string
    # len( dataset_name ) hands back the number of row lists in the table

print( "name of earliest:", earliest_name )
print( "name of latest:  ", latest_name )
"""             <---- will move down to reveal more of the program
"""