import pandas as pd
import os

# Access Google drive
from google.colab import auth, drive
from google.auth import default
from oauth2client.client import GoogleCredentials

# Access Google spreadsheet
import gspread

# Fourier analysis
import scipy.fftpack

# Other plotting
import matplotlib.pyplot as plt

# Other math
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Access Google Drive for data
# Mount from Google drive
# drive.mount('/content/gdrive')

auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)
# gc = gspread.authorize(GoogleCredentials.get_application_default())

# Read in the data
# This is a Google trends csv file indicating the popularity of the search term "wheat prices" in India for every month from June 2004 to July 2019, relative to the month of max popularity with level 100
worksheet = gc.open('multiTimeline.csv').sheet1
rows = worksheet.get_all_values()
df = pd.DataFrame.from_records(rows)
#df = pd.read_csv('gdrive/My Drive/multiTimeline.csv.gsheet')

# Read from Google sheet to pandas dataframe
df_search = pd.DataFrame.from_records(rows)

# Rename column's to 3rd line in file
df.rename(columns=df.iloc[2], inplace=True)

# Drop 1st - 3rd lines in file to get the data
df.drop(df.index[0:3], inplace=True)

# Reset the indexes to start at 0, since first few rows were dropped
df.reset_index(drop=True, inplace=True)

# Convert data types to datetime for the months, and ints for the crop price search term value
df['Month'] = pd.to_datetime(df['Month'], format='%Y-%m-%d')
df['wheat prices: (India)'] = pd.to_numeric(df['wheat prices: (India)'], downcast='integer')

# Set the index to the month
df.set_index('Month', inplace=True)
#df.index = pd.to_datetime(df.index)

print(df)

            wheat prices: (India)
Month                            
2004-01-01                      0
2004-02-01                     77
2004-03-01                     99
2004-04-01                     74
2004-05-01                     63
...                           ...
2019-03-01                      1
2019-04-01                      2
2019-05-01                      2
2019-06-01                      1
2019-07-01                      7

[187 rows x 1 columns]

#df.dtypes
#os.listdir('gdrive/My Drive')
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 187 entries, 2004-01-01 to 2019-07-01
Data columns (total 1 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   wheat prices: (India)  187 non-null    int8 
dtypes: int8(1)
memory usage: 1.6 KB

# Make a line plot
df.plot(kind='line', title='Frequency of the search term \'wheat prices\' on Google in India', figsize=(20, 10))

<Axes: title={'center': "Frequency of the search term 'wheat prices' on Google in India"}, xlabel='Month'>

relative_fraction = [monthly_absolute_term_quereies[i] / monthly_total_google_queries[i] for i in range(len(monthly_total_google_queries))]
google_trend_score = [100 *(proportion / max(relative_fraction)) for proportion in relative_fraction]

# Fourier analysis
yf = scipy.fftpack.fft(df)
xf = np.linspace(0.0, 1.0, len(yf))

fig, ax = plt.subplots()
ax.plot(xf, yf)
ax.set_title('Fourier plot of frequencies of this search term')
plt.show()

/usr/local/lib/python3.10/dist-packages/matplotlib/cbook/__init__.py:1335: ComplexWarning: Casting complex values to real discards the imaginary part
  return np.asarray(x, float)

# Make histogram of most common months for this search term

by_month = df.groupby(by=df.index.month)
plt.plot(by_month.mean(), 'g', label='mean of month')
plt.plot(by_month.max(), 'y', linestyle='dashed', label='max month')
plt.plot(by_month.min(), 'y', linestyle='dashed', label='min month')
plt.xlabel('Month of year')
plt.legend()
plt.title('Relative frequency of Google search query "wheat prices" in India by Month')
plt.show()

SELECT year, value
FROM `bigquery-public-data.world_bank_health_population.health_nutrition_population`
WHERE country_name ="India" AND indicator_name="Population, total"
ORDER BY year ASC

# Set the id for using Google cloud api for BigQuery
# project_id = 'learnprojectindiacropprices'

# Population of India over time
# Read from Bigquery public dataset
# df_population = pd.io.gbq.read_gbq('''
#   SELECT year, value
#   FROM `bigquery-public-data.world_bank_health_population.health_nutrition_population`
#   WHERE country_name ="India" AND indicator_name="Population, total"
#   ORDER BY year ASC
# ''', project_id=project_id, dialect='standard')






# Other Exploratory Data Analysus queries to explore this data table:
# SELECT DISTINCT country_name
# FROM `bigquery-public-data.world_bank_health_population.health_nutrition_population`
# ORDER BY country_name

# SELECT DISTINCT indicator_code
# FROM `bigquery-public-data.world_bank_health_population.health_nutrition_population`

# SELECT DISTINCT indicator_name
# FROM `bigquery-public-data.world_bank_health_population.health_nutrition_population`

# Web use: https://cloud.google.com/bigquery/docs/quickstarts/quickstart-web-ui

# Set the id for using Google cloud api for BigQuery
project_id = 'learnprojectindiacropprices'

# Population of India over time
# Read from Bigquery public dataset
df_population_bigquery = pd.io.gbq.read_gbq('''
  SELECT *
  FROM `bigquery-public-data.world_bank_global_population.population_by_country`
  WHERE country = "India"
  ORDER BY country ASC
  LIMIT 1000
''', project_id=project_id, dialect='standard')

df_population = df_population_bigquery.drop(columns=['country', 'country_code']).T
df_population.index = pd.to_datetime(df_population.index, format='year_%Y')
df_population.rename(columns={0: 'population'}, inplace=True)
df_population.index.rename('year', inplace=True)
df_population.dropna(inplace=True)
df_population.head()

df_population.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 59 entries, 1960-01-01 to 2018-01-01
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   population  59 non-null     Int64
dtypes: Int64(1)
memory usage: 1003.0 bytes

df_population.plot(kind='line', title='Population of India', figsize=(20, 10), grid=True)
print("Data for " + str(len(df_population)) + " years, from " + str(df_population.index[0]) + " to " + str(df_population.index[- 1]))

Data for 59 years, from 1960-01-01 00:00:00 to 2018-01-01 00:00:00

# Fit a line to the data

# Split into training and testing sections
# We will use years 1960-1999 as the training set, and years 2000 and after as the testing set
train_data = df_population.iloc[:df_population.index.get_loc(datetime(year = 2000, month = 1, day = 1))]  # Population data before 2000
test_data =  df_population.iloc[ df_population.index.get_loc(datetime(year = 2000, month = 1, day = 1)):] # Population data after 2000

# The linear model can only handle numbers but not datetime
train_data.index = train_data.index.year
test_data.index = test_data.index.year

# Split into x and y (input and output) variables
#train_x = train_data['year']
#train_x = np.array(train_data.index)
#train_y = np.array(train_data['value'])

# Train data
#train_x = train_data.index
#train_y = train_data['value']
train_x = np.array(train_data.index)
train_y = np.array(train_data['population'], dtype=np.int64)

train_x = train_x.reshape(-1, 1)
train_y = train_y.reshape(-1, 1)

# Test data
#test_x = test_data.index
#test_y = test_data['value']
test_x = np.array(test_data.index)
test_y = np.array(test_data['population'], dtype=np.int64)

test_x = test_x.reshape(-1, 1)
test_y = test_y.reshape(-1, 1)

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(train_x, train_y)

# Make predictions using the testing set
pred_y = regr.predict(test_x)

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(test_y, pred_y))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(test_y, pred_y))

Coefficients: 
 [[15372279.43001877]]
Mean squared error: 2355098467946090.50
Variance score: 0.71

regr.intercept_

array([-2.97185446e+10])

print("Double checking the linear regression model:")
print("prediction for " + str(test_x[10][0]) + " is " + str(pred_y[10][0]) + " which is around " + str(regr.coef_[0][0]) + " * (" + str(test_x[10][0]) + ") + " + str(regr.intercept_[0]) + " = " + str(regr.coef_[0][0]*test_x[10][0] +regr.intercept_[0]))

Double checking the linear regression model:
prediction for 2010 is 1179737098.61557 which is around 15372279.430018766 * (2010) + -29718544555.72215 = 1179737098.61557

# Plot outputs
#plt.scatter(test_x, test_y,  color='black')
#plt.plot(test_x, pred_y, color='blue', linewidth=3)
#plt.title('Population prediction vs measured')

fig, ax = plt.subplots()
ax.plot(test_x, test_y, 'k*', label='test data')
ax.plot(test_x, pred_y, 'b*:', label='prediction')
plt.title('Population prediction vs measured')

legend = ax.legend(shadow=True, fontsize='x-large')

plt.show()

# Plot the data that the model trained on
fig, ax = plt.subplots()
ax.plot(train_x, train_y, 'b*', label='train data')
ax.plot(train_x, regr.predict(train_x), 'r-', label='linear regression')
plt.title('Trained population model (training data)')

legend = ax.legend(shadow=True, fontsize='x-large')

plt.show()

N = 600
# sample spacing
T = 1.0 / 800.0
x = np.linspace(0.0, N*T, N)
y = np.sin(50.0 * 2.0*np.pi*x) + 0.5*np.sin(80.0 * 2.0*np.pi*x)
yf = scipy.fftpack.fft(y)
xf = np.linspace(0.0, 1.0/(2.0*T), N//2)

fig, ax = plt.subplots()
ax.plot(xf, 2.0/N * np.abs(yf[:N//2]))
plt.show()

x = np.linspace(0.0, 10.0, 100)
y = np.sin(x)
yf = scipy.fftpack.fft(y)
xf = np.linspace(0.0, 1.0/(2.0*T), N//2)

fig, ax = plt.subplots()
ax.plot(x, yf)
plt.show()

	population
year
1960-01-01	450547679
1961-01-01	459642165
1962-01-01	469077190
1963-01-01	478825608
1964-01-01	488848135

David Lohmann¶

India Crop Prices Analysis

Old query, now broken by bugquery update as of October 2023:¶

New, working query as of October 2023:¶

Unrelated: Learning Fourier transform:¶

Useful links¶