“””Investment Management and Machine Learning
Week 3
Instructor:Wei Jiao
“””
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as dates
plt.rcParams[‘figure.figsize’] = [20, 15]
“””
Individual Stock Market Data
Monthly
2001″””
st1=pd.read_stata(“D:/Investment Management and Machine Learning/Week 3/stock
return.dta”)
st1.columns
st1.head()
“””
gvkey: stock id
iid: issue ID
datadate: month end
tic: ticker
cusip: stock id
conm: firm name
primiss: identify primary issue
cshtrm: trading volumn
curcdm: currency
prccm: closing price
trt1m: monthly return
cshom: shares outstanding
exchg: stock exchange code.
tpci: identify common stocks
fic: identifies the country in which the company is incorporated or legally registered
sic: industry classification code
“””
st2=st1[st1[‘tpci’]==’0′] #select common stocks
st3=st2[st2[‘fic’]==’USA’] #select firms in US
st4=st3[st3[‘primiss’]==’P’] #select the primary issue of a firm
st5=st4[(st4[‘exchg’]==11)| (st4[‘exchg’]==12) |(st4[‘exchg’]==14) |(st4[‘exchg’]==17)]
#select those traded in major exchanges
#a file lists the names of stock exchanges is posted on Canvas
st5[‘Year’]=st5[‘datadate’].dt.year
st5[‘Month’]=st5[‘datadate’].dt.month
st5.head()
st5[‘ret’]=st5[‘trt1m’]/100 #monthly return reported in compustat (trt1m) is in percentage.
st6=st5[st5[‘ret’]>=-1] #remove observations missing return
st7=st6[st6[‘cshom’]>0] #remove observations with zero or missing shares outstanding
#plot monthly returns using histgram
plt.hist(st7[‘ret’], bins = 100, range=(-0.5,0.5), rwidth=0.8, color=’black’)
plt.xlabel(‘Return bin’, fontsize=20)
plt.ylabel(‘Frequency’, fontsize=20)
plt.title(‘monthly return’, fontsize=20)
plt.legend([‘monthly return’], fontsize=20)
st7[‘mv_million’]=st7[‘prccm’]*st7[‘cshom’]/1000000
#find the market value=price * shares outstanding
st7[‘share turnover’]=st7[‘cshtrm’]/st7[‘cshom’]
#share turnover= trading volumn/ shares outstanding
plt.hist(st7[‘mv_million’], bins = 100, range=(0,1000), rwidth=0.8, color=’black’)
plt.xlabel(‘Market value bin’, fontsize=20)
plt.ylabel(‘Frequency’, fontsize=20)
plt.title(‘Market value’, fontsize=20)
plt.legend([‘Market value’], fontsize=20)
plt.hist(st7[‘share turnover’], bins = 100, range=(0,1), rwidth=0.8, color=’black’)
plt.xlabel(‘Share turnover bin’, fontsize=20)
plt.ylabel(‘Frequency’, fontsize=20)
plt.title(‘Share turnover’, fontsize=20)
plt.legend([‘Share turnover’], fontsize=20)
st7.to_stata(“D:/Investment Management and Machine Learning/Week 3/stock return
processed.dta”)
a=pd.read_stata(“D:/Investment Management and Machine Learning/Week 3/stock return
processed.dta”)
a.dtypes
“””Read data from Yahoo finance”””
pip install yfinance
“””Install the library”””
import yfinance as yf
from datetime import datetime
start_date = datetime(1990, 1, 1) #set start date
end_date=datetime(2022,12,31)
#read index price information from yahoo finance
index_w1 = yf.download([‘^GSPC’, ‘^IXIC’,’^RUT’], start = start_date, end=end_date)[‘Adj Close’]
#^GSPC:S&P500; IXIC: Nasdaq composite; ^RUT: Rusell 2000
index_w1.reset_index(inplace=True)
index_w1[‘Year’]=index_w1[‘Date’].dt.year
index_w1[‘Month’]=index_w1[‘Date’].dt.month
index_w1.rename(columns={‘^GSPC’:’sp500′,’^IXIC’:’nasdaq’,’^RUT’:’r2000′},inplace=True)
index_w1.sort_values(by=[‘Date’], inplace=True)
#find the daily return
index_w1[[‘sp500_ret_d’,’nasdaq_ret_d’,’r2000_ret_d’]]=index_w1[[‘sp500′,’nasdaq’,’r2000′]].p
ct_change()
#pct_change() calculates the percentage change from the immediately previous row.
#find the monthly return
#(1+r1)*(1+r2)*(1+r3)*…*(1+ri)=(1+R_month)
#R_month=(1+r1)*(1+r2)*(1+r3)*…*(1+ri)-1
index_w1[[‘sp500_ret_d+1′,’nasdaq_ret_d+1′,’r2000_ret_d+1’]]=index_w1[[‘sp500_ret_d’,’nasd
aq_ret_d’,’r2000_ret_d’]]+1
index_w2=index_w1[[‘sp500_ret_d+1′,’nasdaq_ret_d+1′,’r2000_ret_d+1′,’Year’,’Month’]].group
by([‘Year’,’Month’], as_index=False).prod()
# prod() returns the product of the values
#in index_w2, sp500_ret_d+1 is the product of all sp500_ret_d+1 in a month
index_w2[[‘sp500_ret_m’,’nasdaq_ret_m’,’r2000_ret_m’]]=index_w2[[‘sp500_ret_d+1′,’nasdaq
_ret_d+1′,’r2000_ret_d+1’]]-1
index_w3=index_w2[[‘sp500_ret_m’,’nasdaq_ret_m’,’r2000_ret_m’,’Year’,’Month’]]
index_w3[‘Date’]= pd.to_datetime({‘year’: index_w3[‘Year’],’month’:
index_w3[‘Month’],’day’:28})
index_w3.plot(x=’Date’, y=[‘sp500_ret_m’,’nasdaq_ret_m’,’r2000_ret_m’], kind=’line’,
linewidth=2, color=[‘green’,’orange’,’blue’])
plt.xlabel(‘Date’, fontsize=20)
plt.ylabel(‘Monthly return’, fontsize=20)
plt.title(‘Index Monthly Return’, fontsize=20)
plt.legend([‘sp500_ret_m’,’nasdaq_ret_m’,’r2000_ret_m’], fontsize=20)
index_w3.to_stata(“D:\Investment Management and Machine Learning\Week 3\Index
return.dta”,write_index=False)
“””
Financial Statement Data
2000
“””
fs1=pd.read_stata(“D:/Investment Management and Machine Learning/Week 3/quarterly
financial statement.dta”)
#datadate is the fiscal quarter end date.
fs1.columns
fs2=fs1[fs1[‘curncdq’]==’USD’]
#keep obs with positive assets
fs3=fs2[fs2[‘atq’]>0]
#Drop duplicated obs
fs3.drop_duplicates(subset =[‘gvkey’ ,’datadate’], keep = ‘first’, inplace = True)
fs3[‘Year’]=fs3[‘datadate’].dt.year
fs3[‘Month’]=fs3[‘datadate’].dt.month
“”” Macro economic data”””
#interest rate
ir1=pd.read_excel(“D:/Investment Management and Machine Learning/Week 3/treasury
bill.xlsx”)
ir1.dropna(inplace=True)
plt.plot(ir1[‘Date’],ir1[‘DGS10′], color=’green’, linewidth=2)
plt.plot(ir1[‘Date’],ir1[‘DGS3MO’], color=’red’, linewidth=2)
plt.xlabel(‘Date’, fontsize=20)
plt.ylabel(‘Treasury Yield in Percentage’, fontsize=20)
plt.title(‘Treasury Yield’, fontsize=20)
plt.legend([’10-Year’,’3-Month’], fontsize=20)
plt.gca().xaxis.set_major_locator(dates.YearLocator(5, month=12, day=31))
ir1[‘spread’]=ir1[‘DGS10’]-ir1[‘DGS3MO’]
ir1[‘zero’]=0
plt.plot(ir1[‘Date’],ir1[‘spread’], color=’grey’, linewidth=2)
plt.plot(ir1[‘Date’],ir1[‘zero’], color=’blue’, linewidth=2, linestyle=’dashed’)
plt.xlabel(‘Date’, fontsize=20)
plt.ylabel(‘Yield spread in Percentage’, fontsize=20)
plt.title(‘Yield spread’, fontsize=20)
plt.legend([‘Yield spread’], fontsize=20)
plt.gca().xaxis.set_major_locator(dates.YearLocator(5, month=12, day=31))
#GDP growth rate
gdp1=pd.read_excel(“D:/Investment Management and Machine Learning/Week 3/gdp
quarterly growth rate.xlsx”, sheet_name=’real gdp growth rate’)
#The advance estimate of quartelry GDP is released about one month after the quarter end
#Release schedule https://www.bea.gov/news/schedule/full
plt.plot(gdp1[‘Date’],gdp1[‘GDP_Growth’], color=’green’, linewidth=2)
plt.xlabel(‘Date’, fontsize=20)
plt.ylabel(‘Quarterly GDP Growth Rate in Percentage’, fontsize=20)
plt.title(‘Quarterly GDP Growth Rate in Percentage’, fontsize=20)
plt.gca().xaxis.set_major_locator(dates.YearLocator(5, month=12, day=31))
“”” Merge Example”””
sample1=pd.read_excel(“D:/Investment Management and Machine Learning/Week 3/merge
sample 1.xlsx”)
sample2=pd.read_excel(“D:/Investment Management and Machine Learning/Week 3/merge
sample 2.xlsx”)
sample_inner1=pd.merge(sample1, sample2,left_on=[‘Name’], right_on=[‘Name’], how=’inner’)
#left_on and right_on indicate the variables to link two datasets
#inner join keep the intersection of two datasets
sample_left1=pd.merge(sample1, sample2,left_on=[‘Name’], right_on=[‘Name’], how=’left’)
#left join keep all the information in the left datasets
sample_right1=pd.merge(sample1, sample2,left_on=[‘Name’], right_on=[‘Name’], how=’right’)
#right join keep all the information in the right datasets
sample_outer1=pd.merge(sample1, sample2,left_on=[‘Name’], right_on=[‘Name’], how=’outer’)
# outer join keep the union of two datasets
“””
Merge returns, financial statement data, and macroeconomic data
“””
“””
Quarterly financial statement information is from Form 10-Q, available in SEC edgar
Companies can dely their reporting to SEC edgar. But there are deadlines.
https://www.investor.gov/introduction-investing/investing-basics/glossary/form-10-k
To ensure investors can access the financial statement information,
we require that there are at least 90-day gap between fiscal quarter end
and the end of the previous month.Thus, we can obtain the financial statement
information before the beginning of a month.
Dec:fiscal quarter end
Jan: gap
Feb: gap
Mar: gap
April: month of return
“””
st7=pd.read_stata(“D:/Investment Management and Machine Learning/Week 3/stock return
processed.dta”)
stret1=st7[[‘gvkey’, ‘ret’,’datadate’]]
stret1.head()
stret1[‘lagdatadate’]=stret1[‘datadate’]+pd.Timedelta(days=40)+pd.tseries.offsets.BusinessMonthEnd(0)
#pd.Timedelta(days=-40) find date 40 days before datadate
#find the last business day of the previous month
#pd.tseries.offsets.BusinessMonthEnd(0) returns the last business day of a month
stret1.head()
fs4=fs3[[‘gvkey’, ‘datadate’,’atq’,’ceqq’,’dlttq’,’epspiq’]]
fs4.rename(columns={‘datadate’:’date_fs’}, inplace=True)
merge1=pd.merge(stret1, fs4, left_on=[‘gvkey’], right_on=[‘gvkey’], how=’inner’)
#merge based on gvkey
merge1[‘datediff’]=merge1[‘lagdatadate’]-merge1[‘date_fs’]
merge2=merge1[(merge1[‘datediff’]>pd.Timedelta(days=90))
&(merge1[‘datediff’]=pd.Timedelta(days=1))
&(ir3[‘datediff_ir’]=pd.Timedelta(days=25))
&(gdp3[‘datediff_gdp’]