# Sam Loyd
#
# Final Project
#
# The analysis of this data set will be performed in R.
#
# The data set was obtained as a compressed archive from Kaggle.
# Data Understanding
# LoanNr_ChkDgt Text Identifier – Primary key, I don't want this to confuse the model
# Name Text Borrower name - Remove for privacy concerns.
# City Text Borrower city
# State Text Borrower state
# Zip Text Borrower zip code
# Bank Text Bank name
# BankState Text Bank state
# NAICS Text North American industry classification system code
# ApprovalDate Date/Time Date SBA commitment issued
# ApprovalFY Text Fiscal year of commitment
# Term Number Loan term in months
# NoEmp Number Number of business employees
# NewExist Text 1 D Existing business, 2 D New business
# CreateJob Number Number of jobs created (Target variable)
# RetainedJob Number Number of jobs retained (*)
# FranchiseCode Text Franchise code, (00000 or 00001) D Nofranchise
# UrbanRural Text 1 D Urban, 2 D rural, 0 D undefined
# RevLineCr Text Revolving line of credit: Y D Yes, N D No
# LowDoc Text LowDoc Loan Program: Y D Yes, N D No
# ChgOffDate Date/Time The date when a loan is declared to be in default (*)
# DisbursementDate Date/Time Disbursement date (*)
# DisbursementGross Currency Amount disbursed (*)
# BalanceGross Currency Gross amount outstanding (*)
# MIS_Status Text Loan status charged off D CHGOFF, Paid in full D PIF (Target variable - Convert to logical)
# ChgOffPrinGr Currency Charged-off amount (*)
# GrAppv Currency Gross amount of loan approved by bank
# SBA_Appv Currency SBA’s guaranteed amount of approved loan
# * warning - future information - most will need to be removed
# Given the nature of this data set it is important that we not let future data
# be used for our model. For example, several fields are based on information after the
# loan was approved. Disbursement is an example.
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(pastecs)
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
library(psych)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(corrplot)
## corrplot 0.84 loaded
library(purrr)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:pastecs':
##
## extract
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
library(mltools)
##
## Attaching package: 'mltools'
## The following object is masked from 'package:tidyr':
##
## replace_na
library(naniar)
library(data.table)
##
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
##
## transpose
## The following objects are masked from 'package:pastecs':
##
## first, last
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(binaryLogic)
##
## Attaching package: 'binaryLogic'
## The following object is masked from 'package:purrr':
##
## negate
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
## The following object is masked from 'package:tidyr':
##
## smiths
library(binaryLogic)
library(RColorBrewer)
library(superml)
## Warning: package 'superml' was built under R version 4.0.3
## Loading required package: R6
# Turn off scientinfic notation
options(scipen=999)
sbaloan_data <- read.csv('SBAnational.csv',na.strings=c("NA","NaN", " ","") )
# glimpse(sbaloan_data)
summary(sbaloan_data)
## LoanNr_ChkDgt Name City State
## Min. :1000014003 Length:899164 Length:899164 Length:899164
## 1st Qu.:2589757508 Class :character Class :character Class :character
## Median :4361439006 Mode :character Mode :character Mode :character
## Mean :4772612311
## 3rd Qu.:6904626505
## Max. :9996003010
##
## Zip Bank BankState NAICS
## Min. : 0 Length:899164 Length:899164 Min. : 0
## 1st Qu.:27587 Class :character Class :character 1st Qu.:235210
## Median :55410 Mode :character Mode :character Median :445310
## Mean :53804 Mean :398661
## 3rd Qu.:83704 3rd Qu.:561730
## Max. :99999 Max. :928120
##
## ApprovalDate ApprovalFY Term NoEmp
## Length:899164 Length:899164 Min. : 0.0 Min. : 0.00
## Class :character Class :character 1st Qu.: 60.0 1st Qu.: 2.00
## Mode :character Mode :character Median : 84.0 Median : 4.00
## Mean :110.8 Mean : 11.41
## 3rd Qu.:120.0 3rd Qu.: 10.00
## Max. :569.0 Max. :9999.00
##
## NewExist CreateJob RetainedJob FranchiseCode
## Min. :0.00 Min. : 0.00 Min. : 0.0 Min. : 0
## 1st Qu.:1.00 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.: 1
## Median :1.00 Median : 0.00 Median : 1.0 Median : 1
## Mean :1.28 Mean : 8.43 Mean : 10.8 Mean : 2754
## 3rd Qu.:2.00 3rd Qu.: 1.00 3rd Qu.: 4.0 3rd Qu.: 1
## Max. :2.00 Max. :8800.00 Max. :9500.0 Max. :99999
## NA's :136
## UrbanRural RevLineCr LowDoc ChgOffDate
## Min. :0.0000 Length:899164 Length:899164 Length:899164
## 1st Qu.:0.0000 Class :character Class :character Class :character
## Median :1.0000 Mode :character Mode :character Mode :character
## Mean :0.7577
## 3rd Qu.:1.0000
## Max. :2.0000
##
## DisbursementDate DisbursementGross BalanceGross MIS_Status
## Length:899164 Length:899164 Length:899164 Length:899164
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## ChgOffPrinGr GrAppv SBA_Appv
## Length:899164 Length:899164 Length:899164
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
# Clean up any non - unique data - None found
sbaloan_data <- sbaloan_data %>% distinct()
summary(sbaloan_data)
## LoanNr_ChkDgt Name City State
## Min. :1000014003 Length:899164 Length:899164 Length:899164
## 1st Qu.:2589757508 Class :character Class :character Class :character
## Median :4361439006 Mode :character Mode :character Mode :character
## Mean :4772612311
## 3rd Qu.:6904626505
## Max. :9996003010
##
## Zip Bank BankState NAICS
## Min. : 0 Length:899164 Length:899164 Min. : 0
## 1st Qu.:27587 Class :character Class :character 1st Qu.:235210
## Median :55410 Mode :character Mode :character Median :445310
## Mean :53804 Mean :398661
## 3rd Qu.:83704 3rd Qu.:561730
## Max. :99999 Max. :928120
##
## ApprovalDate ApprovalFY Term NoEmp
## Length:899164 Length:899164 Min. : 0.0 Min. : 0.00
## Class :character Class :character 1st Qu.: 60.0 1st Qu.: 2.00
## Mode :character Mode :character Median : 84.0 Median : 4.00
## Mean :110.8 Mean : 11.41
## 3rd Qu.:120.0 3rd Qu.: 10.00
## Max. :569.0 Max. :9999.00
##
## NewExist CreateJob RetainedJob FranchiseCode
## Min. :0.00 Min. : 0.00 Min. : 0.0 Min. : 0
## 1st Qu.:1.00 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.: 1
## Median :1.00 Median : 0.00 Median : 1.0 Median : 1
## Mean :1.28 Mean : 8.43 Mean : 10.8 Mean : 2754
## 3rd Qu.:2.00 3rd Qu.: 1.00 3rd Qu.: 4.0 3rd Qu.: 1
## Max. :2.00 Max. :8800.00 Max. :9500.0 Max. :99999
## NA's :136
## UrbanRural RevLineCr LowDoc ChgOffDate
## Min. :0.0000 Length:899164 Length:899164 Length:899164
## 1st Qu.:0.0000 Class :character Class :character Class :character
## Median :1.0000 Mode :character Mode :character Mode :character
## Mean :0.7577
## 3rd Qu.:1.0000
## Max. :2.0000
##
## DisbursementDate DisbursementGross BalanceGross MIS_Status
## Length:899164 Length:899164 Length:899164 Length:899164
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## ChgOffPrinGr GrAppv SBA_Appv
## Length:899164 Length:899164 Length:899164
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
# Get rid of $ sign and commas
sbaloan_data$GrAppv <- gsub('[^a-zA-Z0-9.]', '', sbaloan_data$GrAppv)
sbaloan_data$ApprovalFY <- gsub('[^a-zA-Z0-9.]', '', sbaloan_data$ApprovalFY)
# Do we have job creation data for each Year.
# unique(sbaloan_data$ApprovalFY)
CheckFY <- filter(sbaloan_data, `CreateJob` > 0)
NoCheckFY <- filter(sbaloan_data, `CreateJob` == 0)
# Commented out for length
# CheckFY %>% count(ApprovalFY)
# NoCheckFY %>% count(ApprovalFY)
# How many unique banks
#unique(sbaloan_data$Bank)
# Convert GrAppv to numeric
sbaloan_data$GrAppv <- as.numeric(as.character(sbaloan_data$GrAppv))
sbaloan_data$SBA_Appv <- gsub('[^a-zA-Z0-9.]', '', sbaloan_data$SBA_Appv)
sbaloan_data$RevLineCr <- gsub('[^a-zA-Z0-9]', 'UNKNOWN', sbaloan_data$RevLineCr)
# sbaloan_data$RevLineCr
# unique(sbaloan_data$RevLineCr)
# Convert GrAppv to numeric
sbaloan_data$SBA_Appv <- as.numeric(as.character(sbaloan_data$SBA_Appv))
# Domain Knowledge
# Retrieved from https://www.investopedia.com/terms/c/chargeoff.asp
# Quote:
# A charge-off refers to debt that a company believes it will no longer
# collect as the borrower has become delinquent on payments.
# This would be future knowledge as I am interested in loans that
# Retrieved from https://www.investopedia.com/terms/d/disbursement.asp
# Quote:
# A student loan disbursement is the paying out of loan proceeds to a borrower
# Rebrieved from https://www.census.gov/eos/www/naics/faqs/faqs.html#q1
# Quote:
# The North American Industry Classification System (NAICS,
# pronounced Nakes) was developed under the direction
# and guidance of the Office of Management and Budget (OMB)
# as the standard for use by Federal statistical agencies
# in classifying business establishments for the collection,
# tabulation, presentation, and analysis of statistical
# data describing the U.S. economy. Use of the standard provides
# uniformity and comparability in the presentation of
# these statistical data. NAICS is based on a production-oriented
# concept, meaning that it groups establishments
# into industries according to similarity in the processes
# used to produce goods or services.
# NAICS replaced the Standard Industrial
# Classification (SIC) system in 1997.
# head(sbaloan_data)
summary(sbaloan_data)
## LoanNr_ChkDgt Name City State
## Min. :1000014003 Length:899164 Length:899164 Length:899164
## 1st Qu.:2589757508 Class :character Class :character Class :character
## Median :4361439006 Mode :character Mode :character Mode :character
## Mean :4772612311
## 3rd Qu.:6904626505
## Max. :9996003010
##
## Zip Bank BankState NAICS
## Min. : 0 Length:899164 Length:899164 Min. : 0
## 1st Qu.:27587 Class :character Class :character 1st Qu.:235210
## Median :55410 Mode :character Mode :character Median :445310
## Mean :53804 Mean :398661
## 3rd Qu.:83704 3rd Qu.:561730
## Max. :99999 Max. :928120
##
## ApprovalDate ApprovalFY Term NoEmp
## Length:899164 Length:899164 Min. : 0.0 Min. : 0.00
## Class :character Class :character 1st Qu.: 60.0 1st Qu.: 2.00
## Mode :character Mode :character Median : 84.0 Median : 4.00
## Mean :110.8 Mean : 11.41
## 3rd Qu.:120.0 3rd Qu.: 10.00
## Max. :569.0 Max. :9999.00
##
## NewExist CreateJob RetainedJob FranchiseCode
## Min. :0.00 Min. : 0.00 Min. : 0.0 Min. : 0
## 1st Qu.:1.00 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.: 1
## Median :1.00 Median : 0.00 Median : 1.0 Median : 1
## Mean :1.28 Mean : 8.43 Mean : 10.8 Mean : 2754
## 3rd Qu.:2.00 3rd Qu.: 1.00 3rd Qu.: 4.0 3rd Qu.: 1
## Max. :2.00 Max. :8800.00 Max. :9500.0 Max. :99999
## NA's :136
## UrbanRural RevLineCr LowDoc ChgOffDate
## Min. :0.0000 Length:899164 Length:899164 Length:899164
## 1st Qu.:0.0000 Class :character Class :character Class :character
## Median :1.0000 Mode :character Mode :character Mode :character
## Mean :0.7577
## 3rd Qu.:1.0000
## Max. :2.0000
##
## DisbursementDate DisbursementGross BalanceGross MIS_Status
## Length:899164 Length:899164 Length:899164 Length:899164
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## ChgOffPrinGr GrAppv SBA_Appv
## Length:899164 Min. : 200 Min. : 100
## Class :character 1st Qu.: 35000 1st Qu.: 21250
## Mode :character Median : 90000 Median : 61250
## Mean : 192687 Mean : 149489
## 3rd Qu.: 225000 3rd Qu.: 175000
## Max. :5472000 Max. :5472000
##
sbaloan_data$Month = substr(sbaloan_data$ApprovalDate,3,6)
sbaloan_data$Month <- gsub('[^a-zA-Z]', '', sbaloan_data$Month)
# sbaloan_data$Month
sbaloan_data$Day = substr(sbaloan_data$ApprovalDate,1,2)
sbaloan_data$Day <- gsub('[^[:alnum:]]', '', sbaloan_data$Day)
# sbaloan_data$Day
# Remove future data by dropping columns.
# Future data: BalanceGross, DisbursmentGross, DisbursementDate
# ChgOffDate, ChgOffPrinGr, RetainedJob, and BalanceGross
# ApprovalDate, ApprovalFY, DisbursmentGross, Disbursement Date
# Name - removed for privacy and security
# Approval Fiscal Date and Year removed as future data
# and to avoid problems with predictions at the beginning of a new year.
# City and state - removed as redundant with zip
# Domain Knowledge
# It is also illegal to use local zip in determining loans -
# can lead to racial bias
# Bank geographical information as well .... page 81 (Seigel, 2016)
# Column Removal
sbaloan_data <- subset(sbaloan_data, select = -c(`BankState`,`City`,`State`,`ApprovalDate`,`BalanceGross`,`RetainedJob`,`Name`,`DisbursementGross`,`DisbursementDate`,`ChgOffDate`,`ChgOffPrinGr`))
vis_miss(sbaloan_data,warn_large_data=FALSE) + ggtitle("SMB Missingness Analysis")

# Data preparation
# I am going to keep most of these as categories
# Converting here as its a pain after they are factors
sbaloan_data[is.na(sbaloan_data)]<-'UNKNOWN'
# sbaloan_data$Bank[is.na(sbaloan_data$Bank)] <- " "
# Convert variable as appropriate for analysis
sbaloan_data$Bank = as.factor(sbaloan_data$Bank)
sbaloan_data$NAICS = as.factor(sbaloan_data$NAICS)
sbaloan_data$FranchiseCode = as.factor(sbaloan_data$FranchiseCode)
sbaloan_data$UrbanRural = as.factor(sbaloan_data$UrbanRural)
sbaloan_data$RevLineCr = as.factor(sbaloan_data$RevLineCr)
sbaloan_data$ApprovalFY = as.factor(sbaloan_data$ApprovalFY)
sbaloan_data$Day = as.factor(sbaloan_data$Day)
sbaloan_data$RevLineCr = as.factor(sbaloan_data$RevLineCr)
sbaloan_data$LowDoc = as.factor(sbaloan_data$LowDoc)
sbaloan_data$MIS_Status = as.factor(sbaloan_data$MIS_Status)
sbaloan_data$NewExist = as.factor(sbaloan_data$NewExist)
sbaloan_data$ApprovalFY = as.integer(sbaloan_data$ApprovalFY)
sbaloan_data$Day = as.integer(sbaloan_data$Day)
# Remove loans with unknown status as this is a target variable
sbaloan_data <- filter(sbaloan_data, `MIS_Status` == "CHGOFF" | `MIS_Status` =="P I F")
# df1$IS_PASS = as.factor(df1$IS_PASS)
#glimpse(sbaloan_data)
summary(sbaloan_data)
## LoanNr_ChkDgt Zip Bank
## Min. :1000014003 Min. : 0 BANK OF AMERICA NATL ASSOC : 86773
## 1st Qu.:2593070004 1st Qu.:27612 WELLS FARGO BANK NATL ASSOC : 63461
## Median :4363894001 Median :55416 JPMORGAN CHASE BANK NATL ASSOC: 48131
## Mean :4774981605 Mean :53857 U.S. BANK NATIONAL ASSOCIATION: 35112
## 3rd Qu.:6908644007 3rd Qu.:83706 CITIZENS BANK NATL ASSOC : 33770
## Max. :9996003010 Max. :99999 PNC BANK, NATIONAL ASSOCIATION: 27340
## (Other) :602580
## NAICS ApprovalFY Term NoEmp
## 0 :201667 Min. : 3.00 Min. : 0.0 Min. : 0.00
## 722110 : 27941 1st Qu.:35.00 1st Qu.: 60.0 1st Qu.: 2.00
## 722211 : 19435 Median :40.00 Median : 84.0 Median : 4.00
## 811111 : 14539 Mean :39.14 Mean :110.8 Mean : 11.41
## 621210 : 14034 3rd Qu.:44.00 3rd Qu.:120.0 3rd Qu.: 10.00
## 624410 : 10092 Max. :52.00 Max. :569.0 Max. :9999.00
## (Other):609459
## NewExist CreateJob FranchiseCode UrbanRural
## 0 : 1028 Min. : 0.000 1 :637395 0:322826
## 1 :643446 1st Qu.: 0.000 0 :208040 1:469281
## 2 :252559 Median : 0.000 78760 : 3373 2:105060
## UNKNOWN: 134 Mean : 8.444 68020 : 1921
## 3rd Qu.: 1.000 50564 : 1034
## Max. :8800.000 21780 : 1001
## (Other): 44403
## RevLineCr LowDoc MIS_Status GrAppv
## N :419252 N :780997 CHGOFF :157558 Min. : 1000
## 0 :257431 Y :110171 P I F :739609 1st Qu.: 35000
## Y :200660 UNKNOWN: 2578 UNKNOWN: 0 Median : 90000
## T : 15239 0 : 1490 Mean : 193060
## UNKNOWN: 4534 C : 758 3rd Qu.: 225000
## 1 : 23 S : 603 Max. :5472000
## (Other): 28 (Other): 570
## SBA_Appv Month Day
## Min. : 500 Length:897167 Min. : 1.00
## 1st Qu.: 21250 Class :character 1st Qu.: 8.00
## Median : 62050 Mode :character Median :16.00
## Mean : 149781 Mean :16.01
## 3rd Qu.: 175000 3rd Qu.:23.00
## Max. :5472000 Max. :31.00
##
# Domain Knowledge
# from https://www.sba.gov/sites/default/files/SDOLoanFactSheet_Oct_2011.pdf
# $5 million
# Quote:
# The exact percentage of the guaranty depends on a variety of factors such as size of
# loan and which SBA program is to be used. This will be worked out between the
# SBA and your bank. Amounts - The maximum loan amount is $5 million. The total
# SBA guarantee for any one borrower may not exceed $3,750,000.
# The data shows outliers. Given the information above anything with over 3,750,000 and be imputed
# as likely a data error.
# Correcting for this.
sum(sbaloan_data$SBA_Appv > 3750000)
## [1] 57
# Given the large data set removing 57 rows should be acceptable
sbaloan_data <- filter(sbaloan_data, SBA_Appv < 3750001)
# Further validation
sum(sbaloan_data$GrAppv > 5000000)
## [1] 0
# Returned 0 so the above truncate took care of these as well
# Numeric only for statistics
num_sbaloan_data <- sbaloan_data[,c("Term", "NoEmp", "CreateJob", "SBA_Appv", "GrAppv")]
format(round(stat.desc(num_sbaloan_data,basic = TRUE, norm = FALSE), digits = 3), scientific=FALSE)
## Term NoEmp CreateJob SBA_Appv
## nbr.val 897110.000 897110.000 897110.000 897110.000
## nbr.null 806.000 6617.000 627597.000 0.000
## nbr.na 0.000 0.000 0.000 0.000
## min 0.000 0.000 0.000 500.000
## max 569.000 9999.000 8800.000 3750000.000
## range 569.000 9999.000 8800.000 3749500.000
## sum 99440200.000 10235833.000 7574913.000 134134716152.000
## median 84.000 4.000 0.000 62050.000
## mean 110.845 11.410 8.444 149518.695
## SE.mean 0.083 0.078 0.250 238.793
## CI.mean.0.95 0.163 0.153 0.490 468.027
## var 6224.705 5445.367 56148.933 51155277756.997
## std.dev 78.897 73.793 236.958 226175.325
## coef.var 0.712 6.467 28.063 1.513
## GrAppv
## nbr.val 897110.000
## nbr.null 0.000
## nbr.na 0.000
## min 1000.000
## max 5000000.000
## range 4999000.000
## sum 172944830593.000
## median 90000.000
## mean 192779.961
## SE.mean 296.931
## CI.mean.0.95 581.975
## var 79096532668.695
## std.dev 281241.058
## coef.var 1.459
sample_num_sbaloan_data <- num_sbaloan_data[sample(1:nrow(num_sbaloan_data), 5000,
replace=FALSE),]
format(round(stat.desc(sample_num_sbaloan_data,basic = FALSE, norm = TRUE), digits = 3), scientific=FALSE)
## Term NoEmp CreateJob SBA_Appv GrAppv
## median 84.000 4.000 0.000 60000.000 91450.000
## mean 110.689 14.537 5.549 150184.248 193511.687
## SE.mean 1.117 2.114 2.495 3172.212 3923.145
## CI.mean.0.95 2.190 4.144 4.891 6218.927 7691.085
## var 6239.676 22346.193 31124.306 50314637378.361 76955332679.311
## std.dev 78.992 149.486 176.421 224309.245 277408.242
## coef.var 0.714 10.283 31.791 1.494 1.434
## skewness 1.130 43.905 49.568 3.239 3.091
## skew.2SE 16.318 633.911 715.666 46.760 44.623
## kurtosis 0.217 2053.892 2467.074 16.232 13.273
## kurt.2SE 1.568 14830.098 17813.472 117.201 95.840
## normtest.W 0.841 0.033 0.009 0.643 0.648
## normtest.p 0.000 0.000 0.000 0.000 0.000
# Significant skew and kurtosis. Do not use models that assume a normal distribution.
# Histograms of numeric features
hist(sbaloan_data$Term,
main="Term",
xlab="Months",
col="blue")

hist(sbaloan_data$NoEmp,
main="Employees",
xlab="Count of Employees Histogram",
col="blue")

hist(sbaloan_data$CreateJob,
main="Jobs Created Histogram",
xlab="New Jobs",
breaks = 100,
col="blue")

# Lets filter out the extremes and look
CreateJobAnalysis <- filter(sbaloan_data, `CreateJob` < 100)
hist(CreateJobAnalysis$CreateJob,
main="Histograms of Jobs Created Under 100",
xlab="New Jobs",
breaks = 100,
col="blue")

# Look at loans that created at least 1 job.
CreateJobAnalysis <- CreateJobAnalysis %>%
mutate(CreateJob = ifelse( CreateJob == 0,0,1))
coul <- brewer.pal(5, "Set2")
barplot(table(CreateJobAnalysis$CreateJob),names.arg=c("No Jobs", "At least 1 Job"), col=coul, main="SBA Loan Job Creation", ylab="Count",ylim=c(0,900000))
box()

hist(sbaloan_data$SBA_Appv,
main="SBA Approval Histogram",
xlab="Dollars",
col="blue")

hist(sbaloan_data$GrAppv,
main="Granting Bank Approval Histogram",
xlab="Dollars",
col="blue")

# Skew and kertosis confirmed by visuals
# Box plots of one variable
boxplot(sbaloan_data$Term,
col=(c("gold","darkgreen")),
main="Loan Term Box Plot", xlab="Months")

boxplot(sbaloan_data$NoEmp,
col=(c("gold","darkgreen")),
main="Number of Employees Box Plot", xlab="Employee Count"
)

boxplot(sbaloan_data$CreateJob,
col=(c("gold","darkgreen")),
main="Jobs Created Box Plot", xlab="Employee Count")

boxplot(sbaloan_data$SBA_Appv,
col=(c("gold","darkgreen")),
main="SBA Approval Amount Box Plot", xlab="US DOllars")

boxplot(sbaloan_data$GrAppv,
col=(c("gold","darkgreen")),
main="Granting Bank Approval Amount Box Plot", xlab="US Dollars")

# Data shows significant outliers but domain knowledge does not indicate justification for removal.
# We would not want to not account for some of our largest loans in the model.
# 1 Hot
encoded_sbaloan_data <- sbaloan_data %>% mutate(value = 1) %>% spread(NewExist, value, fill = 0 )
names(encoded_sbaloan_data)[names(encoded_sbaloan_data) == '0'] <- 'NewExist_0'
names(encoded_sbaloan_data)[names(encoded_sbaloan_data) == '1'] <- 'NewExist_1'
names(encoded_sbaloan_data)[names(encoded_sbaloan_data) == '2'] <- 'NewExist_2'
names(encoded_sbaloan_data)[names(encoded_sbaloan_data) == 'UNKNOWN'] <- 'NewExist_U'
encoded_sbaloan_data <- encoded_sbaloan_data %>% mutate(value = 1) %>% spread(UrbanRural, value, fill = 0 )
names(encoded_sbaloan_data)[names(encoded_sbaloan_data) == '0'] <- 'UrbanRural_0'
names(encoded_sbaloan_data)[names(encoded_sbaloan_data) == '1'] <- 'UrbanRural_1'
names(encoded_sbaloan_data)[names(encoded_sbaloan_data) == '2'] <- 'UrbanRural_2'
encoded_sbaloan_data <- encoded_sbaloan_data %>% mutate(value = 1) %>% spread(Month, value, fill = 0 )
names(encoded_sbaloan_data)[names(encoded_sbaloan_data) == 'Jan'] <- 'Month_Jan'
names(encoded_sbaloan_data)[names(encoded_sbaloan_data) == 'Feb'] <- 'Month_Feb'
names(encoded_sbaloan_data)[names(encoded_sbaloan_data) == 'Mar'] <- 'Month_Mar'
names(encoded_sbaloan_data)[names(encoded_sbaloan_data) == 'Apr'] <- 'Month_Apr'
names(encoded_sbaloan_data)[names(encoded_sbaloan_data) == 'May'] <- 'Month_May'
names(encoded_sbaloan_data)[names(encoded_sbaloan_data) == 'Jun'] <- 'Month_Jun'
names(encoded_sbaloan_data)[names(encoded_sbaloan_data) == 'Jul'] <- 'Month_Jul'
names(encoded_sbaloan_data)[names(encoded_sbaloan_data) == 'Aug'] <- 'Month_Aug'
names(encoded_sbaloan_data)[names(encoded_sbaloan_data) == 'Sep'] <- 'Month_Sep'
names(encoded_sbaloan_data)[names(encoded_sbaloan_data) == 'Oct'] <- 'Month_Oct'
names(encoded_sbaloan_data)[names(encoded_sbaloan_data) == 'Nov'] <- 'Month_Nov'
names(encoded_sbaloan_data)[names(encoded_sbaloan_data) == 'Dec'] <- 'Month_Dec'
# Create a new field that changes these to binary
encoded_sbaloan_data <- encoded_sbaloan_data %>%
mutate(MIS_logical = ifelse( MIS_Status == "P I F",1,0))
# Bar Graph of Defaults to Paid
coul <- brewer.pal(5, "Set2")
barplot(table(encoded_sbaloan_data$MIS_logical),names.arg=c("Defaults", "Paid"), col=coul, main="SBA Loan Defaults Versus Paid", ylab="Count",ylim=c(0,900000))
box()

summary(encoded_sbaloan_data$MIS_logical)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 1.0000 1.0000 0.8244 1.0000 1.0000
# Label Encoding
sbaloan_data$Bank <- as.integer(sbaloan_data$Bank)
# sbaloan_data$Bank
# Binary Encoding when too many values
# This code segment was taken from https://www.r-bloggers.com/2020/02/a-guide-to-encoding-categorical-features-using-r/
encode_binary <- function(x, order = unique(x), name = "v_") {
x <- as.numeric(factor(x, levels = order, exclude = NULL))
x2 <- as.binary(x)
maxlen <- max(sapply(x2, length))
x2 <- lapply(x2, function(y) {
l <- length(y)
if (l < maxlen) {
y <- c(rep(0, (maxlen - l)), y)
}
y
})
d <- as.data.frame(t(as.data.frame(x2)))
rownames(d) <- NULL
colnames(d) <- paste0(name, 1:maxlen)
d
}
# Binary Encode calling function above
encoded_sbaloan_data <- cbind(encoded_sbaloan_data, encode_binary(encoded_sbaloan_data[["RevLineCr"]], name = "RevLineCr_"))
encoded_sbaloan_data <- cbind(encoded_sbaloan_data, encode_binary(encoded_sbaloan_data[["FranchiseCode"]], name = "FranchiseCode_"))
encoded_sbaloan_data <- cbind(encoded_sbaloan_data, encode_binary(encoded_sbaloan_data[["LowDoc"]], name = "LowDoc_"))
encoded_sbaloan_data <- cbind(encoded_sbaloan_data, encode_binary(encoded_sbaloan_data[["NAICS"]], name = "NAICS_"))
encoded_sbaloan_data <- cbind(encoded_sbaloan_data, encode_binary(encoded_sbaloan_data[["Bank"]], name = "Bank_"))
# Weak correlation and likely illegal for loan consideration so I am removing
# encoded_sbaloan_data <- cbind(encoded_sbaloan_data, encode_binary(encoded_sbaloan_data[["BankState"]], name = "BankState_"))
# encoded_sbaloan_data <- cbind(encoded_sbaloan_data, encode_binary(encoded_sbaloan_data[["Zip"]], name = "Zip_"))
# Remove columns
encoded_sbaloan_data <- subset(encoded_sbaloan_data, select = -c(`LoanNr_ChkDgt`,`Zip`,`NAICS`,`FranchiseCode`,`RevLineCr`,`LowDoc`))
# Remove origial column
encoded_sbaloan_data <- subset(encoded_sbaloan_data, select = -c(`MIS_Status`))
# Remove Bank since it was enocded and Day as it was just a test and didn't help in later processing
encoded_sbaloan_data <- subset(encoded_sbaloan_data, select = -c(`Bank`,`Day`))
# encoded_sbaloan_data <- subset(encoded_sbaloan_data, select = -c(`Month`,`NewExist`))
# glimpse(encoded_sbaloan_data)
summary(encoded_sbaloan_data)
## ApprovalFY Term NoEmp CreateJob
## Min. : 3.00 Min. : 0.0 Min. : 0.00 Min. : 0.000
## 1st Qu.:35.00 1st Qu.: 60.0 1st Qu.: 2.00 1st Qu.: 0.000
## Median :40.00 Median : 84.0 Median : 4.00 Median : 0.000
## Mean :39.14 Mean :110.8 Mean : 11.41 Mean : 8.444
## 3rd Qu.:44.00 3rd Qu.:120.0 3rd Qu.: 10.00 3rd Qu.: 1.000
## Max. :52.00 Max. :569.0 Max. :9999.00 Max. :8800.000
## GrAppv SBA_Appv NewExist_0 NewExist_1
## Min. : 1000 Min. : 500 Min. :0.000000 Min. :0.0000
## 1st Qu.: 35000 1st Qu.: 21250 1st Qu.:0.000000 1st Qu.:0.0000
## Median : 90000 Median : 62050 Median :0.000000 Median :1.0000
## Mean : 192780 Mean : 149519 Mean :0.001146 Mean :0.7172
## 3rd Qu.: 225000 3rd Qu.: 175000 3rd Qu.:0.000000 3rd Qu.:1.0000
## Max. :5000000 Max. :3750000 Max. :1.000000 Max. :1.0000
## NewExist_2 NewExist_U UrbanRural_0 UrbanRural_1
## Min. :0.0000 Min. :0.0000000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.0000000 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.0000 Median :0.0000000 Median :0.0000 Median :1.000
## Mean :0.2815 Mean :0.0001471 Mean :0.3599 Mean :0.523
## 3rd Qu.:1.0000 3rd Qu.:0.0000000 3rd Qu.:1.0000 3rd Qu.:1.000
## Max. :1.0000 Max. :1.0000000 Max. :1.0000 Max. :1.000
## UrbanRural_2 Month_Apr Month_Aug Month_Dec
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.00000 Median :0.00000
## Mean :0.1171 Mean :0.0892 Mean :0.08762 Mean :0.07777
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.00000
## Month_Feb Month_Jan Month_Jul Month_Jun
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.07376 Mean :0.07459 Mean :0.08509 Mean :0.08711
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## Month_Mar Month_May Month_Nov Month_Oct
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.0000
## Mean :0.09299 Mean :0.08589 Mean :0.07608 Mean :0.0776
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.0000
## Month_Sep MIS_logical RevLineCr_1 RevLineCr_2
## Min. :0.00000 Min. :0.0000 Min. :0.000000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:1.0000 1st Qu.:0.000000 1st Qu.:0.00000
## Median :0.00000 Median :1.0000 Median :0.000000 Median :0.00000
## Mean :0.09231 Mean :0.8244 Mean :0.000029 Mean :0.02207
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.000000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.0000 Max. :1.000000 Max. :1.00000
## RevLineCr_3 RevLineCr_4 FranchiseCode_1 FranchiseCode_2
## Min. :0.0000 Min. :0.000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.000000 1st Qu.:0.000000
## Median :1.0000 Median :1.000 Median :0.000000 Median :0.000000
## Mean :0.5107 Mean :0.696 Mean :0.001716 Mean :0.005145
## 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :1.0000 Max. :1.000 Max. :1.000000 Max. :1.000000
## FranchiseCode_3 FranchiseCode_4 FranchiseCode_5 FranchiseCode_6
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.01001 Mean :0.01568 Mean :0.01845 Mean :0.02627
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## FranchiseCode_7 FranchiseCode_8 FranchiseCode_9 FranchiseCode_10
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.02538 Mean :0.03061 Mean :0.02547 Mean :0.02789
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## FranchiseCode_11 FranchiseCode_12 LowDoc_1 LowDoc_2
## Min. :0.0000 Min. :0.0000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.0000 Median :1.0000 Median :0.000000 Median :0.000000
## Mean :0.2591 Mean :0.7405 Mean :0.002213 Mean :0.003629
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :1.0000 Max. :1.0000 Max. :1.000000 Max. :1.000000
## LowDoc_3 LowDoc_4 NAICS_1 NAICS_2
## Min. :0.0000 Min. :0.0000 Min. :0.000000 Min. :0.00000
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.000000 1st Qu.:0.00000
## Median :1.0000 Median :0.0000 Median :0.000000 Median :0.00000
## Mean :0.8721 Mean :0.1283 Mean :0.009581 Mean :0.07659
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.000000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.000000 Max. :1.00000
## NAICS_3 NAICS_4 NAICS_5 NAICS_6
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.1853 Mean :0.2745 Mean :0.3007 Mean :0.3595
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## NAICS_7 NAICS_8 NAICS_9 NAICS_10
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :1.0000 Median :0.0000
## Mean :0.3927 Mean :0.3908 Mean :0.6294 Mean :0.3923
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## NAICS_11 Bank_1 Bank_2 Bank_3
## Min. :0.000 Min. :0.000000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.000 Median :0.000000 Median :0.00000 Median :0.00000
## Mean :0.341 Mean :0.007908 Mean :0.04277 Mean :0.07864
## 3rd Qu.:1.000 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.000 Max. :1.000000 Max. :1.00000 Max. :1.00000
## Bank_4 Bank_5 Bank_6 Bank_7
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.1407 Mean :0.1674 Mean :0.2415 Mean :0.2897
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Bank_8 Bank_9 Bank_10 Bank_11
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :1.0000
## Mean :0.4432 Mean :0.3277 Mean :0.4854 Mean :0.5921
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Bank_12 Bank_13
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :0.0000
## Mean :0.5339 Mean :0.4879
## 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000
# Plot did not exhibit strong linear correlation
plot(encoded_sbaloan_data$NoEmp, encoded_sbaloan_data$CreateJob, main="Scatterplot Existing & Created Jobs",
xlab="Existing Jobs", ylab="Created Jobs", pch=19)

# Plot did not exhibit strong linear correlation
plot(encoded_sbaloan_data$SBA_Appv, encoded_sbaloan_data$CreateJob, main="Scatterplot SBA Loan Amount & Created Jobs",
xlab="SBA Loan Amount US Dollars not adjusted for current value", ylab="Created Jobs", pch=19)

# Spearman was selected for correlation analysis due to non-parametric data.
cor_matrix <- cor(encoded_sbaloan_data,method="spearman")
# Commented out for length will summarize below
# cor_matrix
# eROUT_CIR <- corrplot(cor_matrix, method = "circle", title = "Circle Correlation Matrix Heat Map", mar=c(0,0,1,0), tl.cex=.75)
# Too much to look at, I want a quick scan of relevant values
# The following code does that
# Code modified slightly from
# https://towardsdatascience.com/how-to-create-a-correlation-matrix-with-too-many-variables-309cc0c0a57
# Look at highest values
corr_simple <- function(data=encoded_sbaloan_data,sig=0.25){
#convert data to numeric in order to run correlations
#convert to factor first to keep the integrity of the data - each value will
#become a number rather than turn into NA
df_cor <- data %>% mutate_if(is.character, as.factor)
df_cor <- df_cor %>% mutate_if(is.factor, as.numeric)
#run a correlation and drop the insignificant ones
corr <- cor(df_cor)
#prepare to drop duplicates and correlations of 1
corr[lower.tri(corr,diag=TRUE)] <- NA
#drop perfect correlations
corr[corr == 1] <- NA
#turn into a 3-column table
corr <- as.data.frame(as.table(corr))
#remove the NA values from above
corr <- na.omit(corr)
#select significant values
corr <- subset(corr, abs(Freq) > sig)
#sort by highest correlation
corr <- corr[order(-abs(corr$Freq)),]
#print table
print(corr)
#turn corr back into matrix in order to plot with corrplot
mtx_corr <- reshape2::acast(corr, Var1~Var2, value.var="Freq")
#plot correlations visually
corrplot(mtx_corr, is.corr=FALSE, tl.col="black", na.label=" ")
}
corr_simple()
## Var1 Var2 Freq
## 568 NewExist_1 NewExist_2 -0.9968132
## 3195 LowDoc_3 LowDoc_4 -0.9933737
## 355 GrAppv SBA_Appv 0.9740971
## 2911 FranchiseCode_11 FranchiseCode_12 -0.9240874
## 701 ApprovalFY UrbanRural_0 -0.8321875
## 781 UrbanRural_0 UrbanRural_1 -0.7851538
## 771 ApprovalFY UrbanRural_1 0.6438458
## 2059 RevLineCr_3 RevLineCr_4 -0.5730451
## 352 Term SBA_Appv 0.5295493
## 2697 FranchiseCode_7 FranchiseCode_9 0.5281540
## 2766 FranchiseCode_6 FranchiseCode_10 0.5176347
## 282 Term GrAppv 0.5057383
## 2698 FranchiseCode_8 FranchiseCode_9 0.4680634
## 2767 FranchiseCode_7 FranchiseCode_10 0.4514295
## 2769 FranchiseCode_9 FranchiseCode_10 0.4500151
## 1971 UrbanRural_0 RevLineCr_3 -0.4430693
## 2627 FranchiseCode_7 FranchiseCode_8 0.4409770
## 2696 FranchiseCode_6 FranchiseCode_9 0.4406931
## 2768 FranchiseCode_8 FranchiseCode_10 0.4277323
## 2555 FranchiseCode_5 FranchiseCode_7 0.4238976
## 2556 FranchiseCode_6 FranchiseCode_7 0.4139644
## 2626 FranchiseCode_6 FranchiseCode_8 0.4132253
## 2695 FranchiseCode_5 FranchiseCode_9 0.4061890
## 1961 ApprovalFY RevLineCr_3 0.4041849
## 2414 FranchiseCode_4 FranchiseCode_5 0.3860626
## 2765 FranchiseCode_5 FranchiseCode_10 0.3846901
## 852 UrbanRural_1 UrbanRural_2 -0.3813796
## 2485 FranchiseCode_5 FranchiseCode_6 0.3794746
## 1972 UrbanRural_1 RevLineCr_3 0.3777353
## 2484 FranchiseCode_4 FranchiseCode_6 0.3687704
## 2801 ApprovalFY FranchiseCode_11 0.3598823
## 2625 FranchiseCode_5 FranchiseCode_8 0.3564138
## 2871 ApprovalFY FranchiseCode_12 -0.3560775
## 2764 FranchiseCode_4 FranchiseCode_10 0.3542669
## 2694 FranchiseCode_4 FranchiseCode_9 0.3458109
## 2624 FranchiseCode_4 FranchiseCode_8 0.3435690
## 3692 NAICS_6 NAICS_7 0.3352409
## 2553 FranchiseCode_3 FranchiseCode_7 0.3166439
## 1752 Term MIS_logical 0.3141285
## 2554 FranchiseCode_4 FranchiseCode_7 0.3022303
## 2693 FranchiseCode_3 FranchiseCode_9 0.2993248
## 2763 FranchiseCode_3 FranchiseCode_10 0.2983977
## 2413 FranchiseCode_3 FranchiseCode_5 0.2978764
## 2900 RevLineCr_4 FranchiseCode_12 -0.2942971
## 71 ApprovalFY Term -0.2927300
## 2830 RevLineCr_4 FranchiseCode_11 0.2915160
## 3651 UrbanRural_0 NAICS_7 -0.2871106
## 2343 FranchiseCode_3 FranchiseCode_4 0.2868328
## 3641 ApprovalFY NAICS_7 0.2860179
## 2483 FranchiseCode_3 FranchiseCode_6 0.2839049
## 3851 ApprovalFY NAICS_10 0.2833578
## 3861 UrbanRural_0 NAICS_10 -0.2798738
## 3581 UrbanRural_0 NAICS_6 -0.2787678
## 4827 Bank_10 Bank_12 -0.2780266
## 3571 ApprovalFY NAICS_6 0.2775135
## 851 UrbanRural_0 UrbanRural_2 -0.2730515
## 2811 UrbanRural_0 FranchiseCode_11 -0.2649130
## 3441 UrbanRural_0 NAICS_4 -0.2612105
## 2881 UrbanRural_0 FranchiseCode_12 0.2603173
## 3431 ApprovalFY NAICS_4 0.2593809
## 2623 FranchiseCode_3 FranchiseCode_8 0.2515352
## 3711 ApprovalFY NAICS_8 0.2510739

# Term had the strongest correlation to loan repayment
# None were noticeable for Jobs Created
# Nothing significant for either
# Write out for python
write.csv(encoded_sbaloan_data,"sbapython.csv", row.names = FALSE)
summary(sbaloan_data$CreateJob)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 8.444 1.000 8800.000