import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

US Consumer Financial Protection Bureau

As an intermediary, the CFPB receives a large number of complaints. To help the CFPB better manage them, they would like to reduce the complaints logged to those that are most likely to be successful. To help them to do this, they would like to be able to indicate to a consumer whether it is likely that their complaint will be accepted. This will help the consumer to decide whether they wish to go through the complaints process, and will reduce the number of complaints logged.

Therefore, your objective is to build a classification model that can be used to:

  1. Determine whether a consumer’s complaint will be accepted and whether they are likely to receive relief.
  2. Help the CFPB understand what factors affect how a company responds to the complaints that it receives.

This investigation is split into three sections:

Exploratory Data Analysis

Classification

Conclusion

On my GitHub page, I have another notebook where I tried some neural-network approaches. However, their performance wasn’t as good as that of the Logistic Regression model used here.

Exploratory Data Analysis

In this section, we perform some basic data cleaning, and plot the data to understand some of the features.

## If you haven't got the data, you can download it from here.
## I've included a sample in this repo, just so you can see the code running.
df = pd.read_csv('Consumer_complaints.csv')

# Normalise the headers into identifier-friendly names: lower-case,
# spaces become underscores, and '-' / '?' are dropped entirely.
drop_chars = str.maketrans('', '', '-?')
df.columns = [
    name.lower().replace(' ', '_').translate(drop_chars)
    for name in df.columns
]
df.columns
Index(['date_received', 'product', 'subproduct', 'issue', 'subissue',
       'consumer_complaint_narrative', 'company_public_response', 'company',
       'state', 'zip_code', 'tags', 'consumer_consent_provided',
       'submitted_via', 'date_sent_to_company', 'company_response_to_consumer',
       'timely_response', 'consumer_disputed', 'complaint_id'],
      dtype='object')
def extract_year_month_date(datetime_series, dateformat=None, is_string=False):
    """Split a date Series into year, month, day and day-of-week columns.

    Parameters
    ----------
    datetime_series : pandas.Series
        Datetime values, or date strings when ``is_string`` is True.
    dateformat : str, optional
        ``strftime`` pattern used to parse the strings (note that ``%m`` is
        the month and ``%M`` is the minute). When None, pandas infers the
        format itself. Only used when ``is_string`` is True.
    is_string : bool, default False
        Parse ``datetime_series`` with ``pd.to_datetime`` before extracting.

    Returns
    -------
    pandas.DataFrame
        Integer columns ``year_<name>``, ``month_<name>``, ``day_<name>`` and
        ``dayofweek_<name>`` (Monday == 0), where ``<name>`` is the name of the
        input Series, sharing the input's index.

    Notes
    -----
    The final integer cast fails if any date is missing (NaT).
    """
    if is_string:
        # `infer_datetime_format` is deprecated since pandas 2.0; leaving
        # `format=None` already asks pandas to infer the format.
        datetime_series = pd.to_datetime(datetime_series, format=dateformat)

    colname = datetime_series.name
    # The vectorised `.dt` accessor replaces four per-element Python lambdas.
    parts = datetime_series.dt
    return pd.DataFrame(
        {
            'year_{}'.format(colname): parts.year,
            'month_{}'.format(colname): parts.month,
            'day_{}'.format(colname): parts.day,
            'dayofweek_{}'.format(colname): parts.dayofweek,
        },
        index=datetime_series.index,
    ).astype(int)

# '%M' is the strftime code for *minutes*; the month is '%m'. With the old
# "%d/%M/%Y" pattern every parsed date landed in January and the day column
# held whatever the first field was.
# NOTE(review): the CSV is assumed to use US-style MM/DD/YYYY dates - confirm
# against the file and switch to "%d/%m/%Y" if it turns out to be day-first.
date_format = "%m/%d/%Y"
for date_col in ['date_received', 'date_sent_to_company']:
    df = df.join(extract_year_month_date(df[date_col], is_string=True, dateformat=date_format))
# Look at the total complaints for different products, by year 
pd.crosstab(df['product'], df.year_date_received)
year_date_received 2011 2012 2013 2014 2015 2016 2017
product
Bank account or service 0 12212 13388 14662 17140 21849 6956
Checking or savings account 0 0 0 0 0 0 9947
Consumer Loan 0 1986 3117 5457 7888 9602 3558
Credit card 1260 15353 13105 13974 17300 21066 7132
Credit card or prepaid card 0 0 0 0 0 0 11921
Credit reporting 0 1873 14380 29239 34273 44081 16578
Credit reporting, credit repair services, or other personal consumer reports 0 0 0 0 0 0 59186
Debt collection 0 0 11069 39148 39757 40492 41101
Money transfer, virtual currency, or money service 0 0 0 0 0 0 2213
Money transfers 0 0 559 1169 1619 1567 440
Mortgage 1276 38109 49401 42962 42353 41471 26622
Other financial service 0 0 0 116 312 466 165
Payday loan 0 0 194 1706 1586 1567 493
Payday loan, title loan, or personal loan 0 0 0 0 0 0 2245
Prepaid card 0 0 0 336 1784 1250 449
Student loan 0 2840 3005 4283 4501 8087 15896
Vehicle loan or lease 0 0 0 0 0 0 2873
Virtual currency 0 0 0 1 7 7 3
## Several product names in the table above overlap, so we fold each of the
## names on the left into the single category on the right.
remap_products = {
    # product: remapped_product
    'Credit card': 'Credit card or prepaid card',
    'Prepaid card': 'Credit card or prepaid card',
    'Credit reporting, credit repair services, or other personal consumer reports': 'Credit reporting',
    'Money transfers': 'Money transfer, virtual currency, or money service',
    'Virtual currency': 'Money transfer, virtual currency, or money service',
    'Payday loan, title loan, or personal loan': 'Payday loan',
}

# Products that have no entry in the mapping keep their original name.
df['product'] = df['product'].apply(lambda name: remap_products.get(name, name))
# Collapse the three "relief" outcomes into one binary target:
# 1 when the company closed the complaint with any kind of relief, else 0.
relief = [
    'Closed with monetary relief',
    'Closed with non-monetary relief',
    'Closed with relief',
]
df['relief_received'] = df.company_response_to_consumer.isin(relief).astype(int)

# Row-normalise the product x outcome counts so each product's bars sum to 1.
tab = pd.crosstab(df['product'], df['relief_received'])
shares = tab.div(tab.sum(axis=1), axis=0)

fig, ax = plt.subplots(figsize=(10,10))
shares.plot(kind='bar', ax=ax)
plt.title('Percentage company responses for each product type.')

plt.show()

png

# Mean relief rate per (year, product), pivoted so each product is one line.
annual_relief = (
    df.groupby(['year_date_received', 'product'])['relief_received']
      .mean()
      .unstack()
)

fig, ax = plt.subplots(figsize=(20,10))
annual_relief.plot(ax=ax)
plt.legend(loc='lower center', ncol=2)
plt.suptitle('Annual relief rate by product')
plt.show()

png

Company size

We observe that most companies have a very small number of complaints, while others have a very large number. Does this affect the relief rate?

# Share of companies with fewer than 100 rows (counted via non-null `product`).
rows_per_company = df.groupby('company')['product'].count()
small_share = (rows_per_company < 100).sum() / df.company.nunique() * 100
print("{0:.1f}% of companies do less than 100 transactions.".format(small_share))
89.4% of companies do less than 100 transactions.
# Most companies are very small, so bucket them by their number of complaints.
company_size = df.groupby('company')['product'].count()
size_edges = [0, 100, 1000, 10000, 50000, 100000]
size_labels = ['very_small', 'small', 'medium', 'large', 'very_large']
company_size_bin = (
    pd.cut(company_size, bins=size_edges, labels=size_labels)
      .rename('company_size')
      .reset_index()
)

# Attach the size bucket and the per-company mean relief rate to every complaint.
df = df.merge(company_size_bin, on='company')
df['company_relief_rate'] = df.groupby('company')['relief_received'].transform('mean')

fig, ax = plt.subplots(figsize=(10,10))
df.boxplot(column='company_relief_rate', by='company_size', ax=ax)
plt.show()

png

# Complaint counts per product and company-size bucket; for very large
# companies the biggest cells are credit reporting, mortgages and bank accounts.
pd.crosstab(index=df['product'], columns=df['company_size'])
company_size very_small small medium large very_large
product
Bank account or service 1000 6115 30471 21322 27299
Checking or savings account 169 655 3625 2559 2939
Consumer Loan 2792 9735 11308 4441 3332
Credit card or prepaid card 572 3801 27592 58072 14893
Credit reporting 1942 5515 5512 3934 182707
Debt collection 36514 58364 57402 15051 4236
Money transfer, virtual currency, or money service 395 831 4960 635 764
Mortgage 7786 15079 58160 87537 73632
Other financial service 324 182 263 149 141
Payday loan 1533 4194 1647 216 201
Student loan 1640 4858 9704 20787 1623
Vehicle loan or lease 278 938 1128 293 236
# Mean relief rate per (year, company size), one line per size bucket.
relief_by_size = (
    df.groupby(['year_date_received', 'company_size'])['relief_received']
      .mean()
      .unstack()
)

fig, ax = plt.subplots(figsize=(10,10))
relief_by_size.plot(ax=ax)
plt.title('Proportion of complaints in which relief is given \nfor companies of different size, by year')
plt.show()

png

Discussion of exploratory data analysis

From the exploratory analysis above, we can make a few initial observations:

  • Credit cards and prepaid cards, credit reporting, and account services have the highest relief rates, but these are gradually decreasing year-on-year
  • Larger companies have a higher average relief rate, but there is much greater variation in small companies.

Classification

In this task, we are asked to build a classification model that can be used to:

  1. Determine whether a consumer’s complaint will be accepted and whether they are likely to receive relief.
  2. Help the CFPB understand what factors affect how a company responds to the complaints that it receives.

From this statement, we can begin to make plans about what sort of model to use in classification:

  • an interpretable model will be helpful as we will immediately gain an insight into factors affecting company response
  • if the model is not interpretable, we will need to use some kind of model inspection or explanation technique
  • Relevant factors may be found in the text, but also in the metadata of the complaint (product, issue, tag etc.) so we should also include these features in our model.

Baseline

In any classification problem, it’s useful to establish a baseline for further development; for NLP problems this is usually a bag-of-words input with a linear model. Here we will try Logistic Regression and a Linear SVC.

Data preprocessing

It is standard practice to undertake some cleaning of the text, such as removing stopwords and punctuation; from a brief glance at some samples, we can see that the complaints are anonymised using ‘xxxx’ tokens - these should also be removed. As mentioned above, a bag-of-words model is a standard approach, but we can also use Tf-Idf to give a relevance weighting to tokens within the text.

The following metadata features will also be included: product, issue, sub-issue and company size (the `tags` column is not part of the feature list used in the code below). We will not include the actual company, as this may lead to overfitting based on the relief rate of companies in the training set, and would not generalise to new companies as they appear in the data.

There is a 4:1 imbalance in the data, which is not too extreme, but should be accounted for. As a minimum, the data must be stratified in the train-test split, and during model development we can explore the use of class weights and oversampling of the data.

# Class balance among the complaints that actually have a narrative.
with_narrative = df.dropna(subset=['consumer_complaint_narrative'])
with_narrative.relief_received.value_counts() / with_narrative.shape[0]
0    0.819553
1    0.180447
Name: relief_received, dtype: float64
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
import scipy.stats as stats
np.random.seed(31415)

# Only complaints with a narrative can be used for the text models.
df_text = df.copy()
df_text.dropna(axis=0,subset=['consumer_complaint_narrative'], inplace=True)

# Strip the anonymisation masks. They appear in upper case in the raw text
# (e.g. "XXXX", "XX/XX/XXXX"), so the match must be case-insensitive, and
# `regex=True` must be explicit because pandas >= 2.0 treats the pattern as a
# literal string by default.
X = df_text['consumer_complaint_narrative'].str.replace(r'xx+', '', case=False, regex=True)
y = df_text.relief_received
# Stratify so both splits keep the same relief / no-relief ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=31415)

# Baseline: bag-of-words counts fed to Logistic Regression and LinearSVC.
relief_tags = [
    'Closed with non-monetary relief',
    'Closed with monetary relief',
    'Closed with relief',
]

# Drop English stop words, terms that occur in more than half of the
# documents, and terms that occur in fewer than 100 documents.
vectorizer = CountVectorizer(
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
    max_df=0.5,
    min_df=100,
)

# Fit the vocabulary on the training split only, then reuse it for the test split.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# The positive (relief) class is up-weighted 5:1 in both models.
models = {
    'lr': LogisticRegression(class_weight={0: 1, 1: 5}, random_state=31415),
    'svc': LinearSVC(class_weight={0: 1, 1: 5}, random_state=31415),
}
preds = {}

for name, model in models.items():
    print(name)
    preds[name] = model.fit(X_train, y_train).predict(X_test)
    print(classification_report(y_test, preds[name]))

lr


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)


              precision    recall  f1-score   support

           0       0.91      0.66      0.77     32777
           1       0.31      0.69      0.43      7217

    accuracy                           0.67     39994
   macro avg       0.61      0.68      0.60     39994
weighted avg       0.80      0.67      0.71     39994

svc
              precision    recall  f1-score   support

           0       0.89      0.74      0.81     32777
           1       0.33      0.59      0.42      7217

    accuracy                           0.71     39994
   macro avg       0.61      0.66      0.62     39994
weighted avg       0.79      0.71      0.74     39994



ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  "the number of iterations.", ConvergenceWarning)

Leveraging additional meta-data

from scipy.sparse import hstack

ohe = OneHotEncoder(handle_unknown='ignore')
# we won't use company name, as this would lead to overfitting to individual companies
features = ['product','issue','subissue','company_size'] 

# Take an explicit copy: assigning into a bare column selection of `df_text`
# is what triggered pandas' SettingWithCopyWarning.
X = df_text[features + ['consumer_complaint_narrative']].copy()
fill_cols = ['product','issue','subissue']
X[fill_cols] = X[fill_cols].fillna('none')
y = df_text.relief_received
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=31415)

# Both encoders are fitted on the training split only and reused on the test split.
vectorizer_meta_data = CountVectorizer()
X_train_text = vectorizer_meta_data.fit_transform(X_train['consumer_complaint_narrative'])
X_test_text = vectorizer_meta_data.transform(X_test['consumer_complaint_narrative'])
X_train_features = ohe.fit_transform(X_train[features])
X_test_features = ohe.transform(X_test[features])

# Text columns come first and the one-hot columns last; the coefficient
# inspection further down relies on this ordering.
X_train = hstack([X_train_text, X_train_features])
X_test = hstack([X_test_text, X_test_features])



SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
# Logistic Regression on text + metadata, with classes re-weighted inversely
# to their frequency.
# NOTE: with the default iteration limit the lbfgs solver reports a
# ConvergenceWarning on this data.
model = LogisticRegression(
    class_weight='balanced',
    random_state=31415,
)
model.fit(X_train, y_train)
ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)





LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=31415, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
# Score the held-out split and print per-class precision / recall / F1.
preds = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=preds))
              precision    recall  f1-score   support

           0       0.91      0.72      0.80     32777
           1       0.34      0.66      0.45      7217

    accuracy                           0.71     39994
   macro avg       0.62      0.69      0.63     39994
weighted avg       0.80      0.71      0.74     39994

Inspecting the Logistic Regression model

# For every one-hot column, record which original feature it came from.
feature_melt = [
    feature
    for feature, feature_categories in zip(features, ohe.categories_)
    for _ in feature_categories
]

categories = np.hstack(ohe.categories_)

# The one-hot columns were stacked after the text columns, so their
# coefficients are the last len(categories) entries of the weight vector.
# Building the frame column-wise keeps `coef` as a float column; the previous
# list-of-rows construction followed by .T coerced every column to object.
coef_df = pd.DataFrame({
    'category': categories,
    'coef': model.coef_[0, -len(categories):],
    'original_feature': feature_melt,
})

fig, ax = plt.subplots(figsize=(10,10))
coef_df.boxplot(column='coef', 
           by='original_feature', 
           ax=ax,
           )
plt.suptitle('Boxplot of coefficients for meta-data features of complaints')
plt.show()

png

# Product coefficients, from most to least associated with relief.
is_product = coef_df['original_feature'] == 'product'
coef_df.loc[is_product].sort_values(by='coef', ascending=False)
category coef original_feature
3 Credit card or prepaid card 0.545969 product
0 Bank account or service 0.326113 product
1 Checking or savings account 0.225128 product
5 Debt collection 0.0556301 product
8 Other financial service 0.0510896 product
2 Consumer Loan 0.0438741 product
11 Vehicle loan or lease -0.001235 product
4 Credit reporting -0.0166776 product
6 Money transfer, virtual currency, or money ser... -0.156185 product
9 Payday loan -0.386861 product
7 Mortgage -0.526208 product
10 Student loan -0.649351 product
# Company-size coefficients, from most to least associated with relief.
is_company_size = coef_df['original_feature'] == 'company_size'
coef_df.loc[is_company_size].sort_values(by='coef', ascending=False)
category coef original_feature
381 very_large 0.463117 company_size
379 medium 0.136443 company_size
378 large -0.073706 company_size
382 very_small -0.471874 company_size
380 small -0.542694 company_size
# The ten issues with the largest positive coefficients.
is_issue = coef_df['original_feature'] == 'issue'
coef_df.loc[is_issue].sort_values(by='coef', ascending=False).head(10)
category coef original_feature
156 Unable to get credit report/credit score 0.875798 issue
134 Problems caused by my funds being low 0.542986 issue
87 Late fee 0.3429 issue
46 Communication tactics 0.326751 issue
159 Unauthorized transactions/trans. issues 0.305247 issue
80 Improper contact or sharing of info 0.284458 issue
101 Managing, opening, or closing account 0.250215 issue
108 Other fee 0.233449 issue
30 Billing disputes 0.198935 issue
56 Credit monitoring or identity protection 0.177035 issue
# Vocabulary of the baseline vectorizer, paired with the coefficients of the
# baseline Logistic Regression that was trained on those counts.
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = vectorizer.get_feature_names_out()
else:
    words = vectorizer.get_feature_names()

# Column-wise construction keeps `coef` numeric instead of object dtype.
word_coefs = pd.DataFrame({'words': words, 'coef': models['lr'].coef_[0]})

# Display the words with largest positive coefficients
print(word_coefs.sort_values('coef', ascending=False)[:60])
                   words      coef
2844           jefferson   2.72214
1829             dynamic   2.29037
4377            rushcard   1.91164
1920            enhanced   1.57078
2789          interstate    1.4535
4376                rush    1.3884
3837       professionals  0.830137
1096            citigold  0.801865
5132           universal  0.792687
562   annualcreditreport  0.776771
3585            partners   0.68735
520               allied  0.623358
1019                cbna  0.613705
2586           hurricane   0.58328
1265                conn  0.560628
1256         conflicting  0.557217
4885          technology  0.530956
4185              repaye  0.526669
2435               guide  0.525563
1037       certification  0.522281
3318               multi  0.518445
281                  809  0.499301
548         amortization  0.497155
3485          operations  0.495929
1212          completing  0.493841
715             attitude  0.491621
1016             cavalry   0.48636
4607              solely  0.483301
3075                  lt  0.473938
1957                 erc  0.472124
1672            diligent  0.471448
1811             driving  0.471193
2821                  iq  0.467393
5048           triggered  0.464511
221                  580  0.458511
5210               usury  0.456739
1024              ceased  0.452705
1165            combined  0.452562
2866        jurisdiction  0.451061
3869            proposed  0.447958
2519               hipaa  0.446082
4331               rings  0.440639
5259                 vet  0.439018
4775          subsection  0.435682
4749              strict  0.434813
3480             operate  0.429676
40                 1681b  0.429669
3088                macy  0.427806
3218             midland  0.424827
4313           reversing  0.423636
689          association  0.422513
5321              warned  0.421285
3027              loaded  0.420755
2895                  ky  0.416796
4738        straightened  0.416616
581              apology  0.409819
2038            existent  0.409039
1630           desperate  0.408801
3907          punishment    0.4077
433       administration  0.406148
import re

def inspect_complaints(string, complaints_array, n=20):
    """Print up to ``n`` complaints that mention ``string``, highlighting it.

    A complaint is selected when ``string`` (used as a regular expression)
    is immediately followed by a whitespace character. Every literal
    occurrence of ``string`` in a selected complaint is wrapped in ``[[ ]]``
    before the complaint is printed.

    Parameters
    ----------
    string : str
        Term to search for.
    complaints_array : pandas.Series
        Complaint narratives. Entries that are not strings (e.g. NaN for a
        missing narrative) are skipped rather than raising ``TypeError``.
    n : int, default 20
        Maximum number of complaints to print. If fewer complaints match,
        all of them are printed and a notice is shown.

    Returns
    -------
    None
    """
    # Raw string: '\s' inside a normal literal is an invalid escape sequence.
    # Compiling once also avoids re-building the pattern for every row.
    pattern = re.compile(string + r'\s')

    matches = complaints_array.apply(
        lambda x: isinstance(x, str) and pattern.search(x) is not None
    ).astype(bool)
    complaints = complaints_array[matches]

    available = complaints.shape[0]
    if n > available:
        n = available
        print('n was too large, printing all available complaints.\n\n')

    # max(..., 0) keeps a negative n meaning "print nothing".
    for complaint in complaints.iloc[:max(n, 0)]:
        print(complaint.replace(string, '[[' + string + ']]'))
        print('\n\n\n')
            
# Print complaints mentioning '1681b', one of the terms in the listing above.
inspect_complaints('1681b', X.consumer_complaint_narrative)
While checking my personal credit report, I discovered an Unauthorized and Fraudulent credit inquiry made without my KNOWLEDGE or CONSENT by XXXX on or about XXXX/XXXX/2014 on TRANSUNION credit file. I did not authorized or give permission to anyone employed by this company to make any inquiry and view my credit report. XXXX has violated the Fair Credit Reporting Act Section [[1681b]] ( c ). They were not legally entitled to make this fraudulent inquiry. This is a serious breach of my privacy rights. I am requesting that they mail me a copy of my signed authorization form that gave them the right to view my credit within five ( 5 ) business days so that I can verify its validity and advised them that if they can not provide me with proof that I authorized them to view my credit report then I am demanding that they contact the credit bureaus and have them remove the unauthorized and fraudulent hard inquiry immediately.




While checking my personal credit report, I noticed an unauthorized and fraudulent credit inquiry made by XXXX on or about XX/XX/XXXX on Transunion. I did not authorized anyone employed by this company to make any inquiry and view my credit report. XXXX has violated the Fair Credit Reporting Act Section [[1681b]] ( c ). They were not legally entitled to make this fraudulent inquiry. This is a serious breach of my privacy rights. 
I have requested that they mail me a copy of my signed authorization form that gave them the right to view my credit within five ( 5 ) business days so that I can verify its validity and advised them that if they can not provide me with proof that I authorized them to view my credit report then I am demanding that they contact the credit bureaus immediately and have them remove the unauthorized and fraudulent hard inquiry immediately. I also requested that they remove my personal information from their records. My Social Security # is XXXX and my Date of Birth is XX/XX/XXXX in case it is needed to locate the fraudulent inquiry in their system.




While checking my personal credit report, I discovered an Unauthorized and Fraudulent credit inquiry made without my KNOWLEDGE or CONSENT by XXXX on or about XXXX/XXXX/2015 on TRANSUNION credit file. I did not authorized or give permission to anyone employed by this company to make any inquiry and view my credit report. XXXX has violated the Fair Credit Reporting Act Section [[1681b]] ( c ). They were not legally entitled to make this fraudulent inquiry. This is a serious breach of my privacy rights. I am requesting that they mail me a copy of my signed authorization form that gave them the right to view my credit within five ( 5 ) business days so that I can verify its validity and advised them that if they can not provide me with proof that I authorized them to view my credit report then I am demanding that they contact the credit bureaus and have them remove the unauthorized and fraudulent hard inquiry immediately.




While checking my personal credit report, I noticed an unauthorized and fraudulent credit inquiry made by XX/XX/XXXX on or about XX/XX/XXXXon Transunion. I did not authorized anyone employed by this company to make any inquiry and view my credit report. XX/XX/XXXX has violated the Fair Credit Reporting Act Section [[1681b]] ( c ). They were not legally entitled to make this fraudulent inquiry. This is a serious breach of my privacy rights. 
I have requested that they mail me a copy of my signed authorization form that gave them the right to view my credit within five ( 5 ) business days so that I can verify its validity and advised them that if they can not provide me with proof that I authorized them to view my credit report then I am demanding that they contact the credit bureaus immediately and have them remove the unauthorized and fraudulent hard inquiry immediately. I also requested that they remove my personal information from their records. My Social Security # is XXXX and my Date of Birth is XX/XX/XXXX in case it is needed to locate the fraudulent inquiry in their system.




While checking my personal credit report, I discovered an Unauthorized and Fraudulent credit inquiry made without my KNOWLEDGE or CONSENT by XXXX on or about XXXX/XXXX/2014 on TRANSUNION credit file. I did not authorized or give permission to anyone employed by this company to make any inquiry and view my credit report. XXXX has violated the Fair Credit Reporting Act Section [[1681b]] ( c ). They were not legally entitled to make this fraudulent inquiry. This is a serious breach of my privacy rights. I am requesting that they mail me a copy of my signed authorization form that gave them the right to view my credit within five ( 5 ) business days so that I can verify its validity and advised them that if they can not provide me with proof that I authorized them to view my credit report then I am demanding that they contact the credit bureaus and have them remove the unauthorized and fraudulent hard inquiry immediately.




While checking my personal credit report, I noticed an unauthorized and fraudulent credit inquiry made by XXXX on or about XX/XX/XXXX on Transunion. I did not authorized anyone employed by this company to make any inquiry and view my credit report. XXXX has violated the Fair Credit Reporting Act Section [[1681b]] ( c ). They were not legally entitled to make this fraudulent inquiry. This is a serious breach of my privacy rights. 
I have requested that they mail me a copy of my signed authorization form that gave them the right to view my credit within XXXX ( XXXX ) business days so that I can verify its validity and advised them that if they can not provide me with proof that I authorized them to view my credit report then I am demanding that they contact the credit bureaus immediately and have them remove the unauthorized and fraudulent hard inquiry immediately. I also requested that they remove my personal information from their records. My Social Security # is XXXX and my Date of Birth is XX/XX/XXXX in case it is needed to locate the fraudulent inquiry in their system.




While checking my personal credit report, I discovered an Unauthorized and Fraudulent credit inquiry made without my KNOWLEDGE or CONSENT by XXXX on or about XXXX/XXXX/2014 on TRANSUNION credit file. I did not authorized or give permission to anyone employed by this company to make any inquiry and view my credit report. XXXX has violated the Fair Credit Reporting Act Section [[1681b]] ( c ). They were not legally entitled to make this fraudulent inquiry. This is a serious breach of my privacy rights. I am requesting that they mail me a copy of my signed authorization form that gave them the right to view my credit within five ( 5 ) business days so that I can verify its validity and advised them that if they can not provide me with proof that I authorized them to view my credit report then I am demanding that they contact the credit bureaus and have them remove the unauthorized and fraudulent hard inquiry immediately.




While checking my personal credit report, I noticed an unauthorized and fraudulent credit inquiry made by XXXX on or about XXXX/XXXX/XXXXon Transunion. I did not authorized anyone employed by this company to make any inquiry and view my credit report. XXXX XXXX XXXX has violated the Fair Credit Reporting Act Section [[1681b]] ( c ). They were not legally entitled to make this fraudulent inquiry. This is a serious breach of my privacy rights. 
I have requested that they mail me a copy of my signed authorization form that gave them the right to view my credit within five ( 5 ) business days so that I can verify its validity and advised them that if they can not provide me with proof that I authorized them to view my credit report then I am demanding that they contact the credit bureaus immediately and have them remove the unauthorized and fraudulent hard inquiry immediately. I also requested that they remove my personal information from their records. My Social Security # is XXXX and my Date of Birth is XX/XX/XXXX in case it is needed to locate the fraudulent inquiry in their system.




While checking my personal credit report, I discovered an Unauthorized and Fraudulent credit inquiry made without my KNOWLEDGE or CONSENT by XXXX on or about XXXX/XXXX/2014 on TRANSUNION credit file. I did not authorized or give permission to anyone employed by this company to make any inquiry and view my credit report. XXXX has violated the Fair Credit Reporting Act Section [[1681b]] ( c ). They were not legally entitled to make this fraudulent inquiry. This is a serious breach of my privacy rights. I am requesting that they mail me a copy of my signed authorization form that gave them the right to view my credit within five ( 5 ) business days so that I can verify its validity and advised them that if they can not provide me with proof that I authorized them to view my credit report then I am demanding that they contact the credit bureaus and have them remove the unauthorized and fraudulent hard inquiry immediately.




I just checked my personal credit report, just discovered an unauthorized and fraudulent credit inquiry made by XXXX on or about XXXX/XXXX/14 on TRANSUNION. I did not authorized anyone employed by this company or at this company or TRANSUNION to make any inquiry or inquiries and view or show my credit report to anyone, person, company, entity, business, co, corp or similar. XXXX has violated the Fair Credit Reporting Act Section [[1681b]] ( c ). They were not legally entitled to make this fraudulent inquiry. This is a serious breach of my privacy rights. I have requested that they mail me a copy of my signed authorization form that gave them the right to view my credit within five ( 5 ) business days so that I can verify its validity and advised them that if they can not provide me with proof Bearing my Signature or Request in Writing that I authorized them to view my credit report, then I am demanding that TRANSUNION remove the unauthorized and fraudulent hard credit inquiry immediately from my TRANSUNION credit file.




While checking my personal credit report, I noticed an unauthorized and fraudulent credit inquiry made by XXXX on or about XX/XX/XXXX on Transunion. I did not authorized anyone employed by this company to make any inquiry and view my credit report. XXXX has violated the Fair Credit Reporting Act Section [[1681b]] ( c ). They were not legally entitled to make this fraudulent inquiry. This is a serious breach of my privacy rights. 
I have requested that they mail me a copy of my signed authorization form that gave them the right to view my credit within five ( 5 ) business days so that I can verify its validity and advised them that if they can not provide me with proof that I authorized them to view my credit report then I am demanding that they contact the credit bureaus immediately and have them remove the unauthorized and fraudulent hard inquiry immediately. I also requested that they remove my personal information from their records. My Social Security # is XXXX and my Date of Birth is XX/XX/XXXX in case it is needed to locate the fraudulent inquiry in their system.




I recently check my Transunion report and it shows an unauthorized Credit Inquiry made from XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX, UT XXXX ( XXXX ) XXXX. 

While checking my personal credit report, which I acquired from [ Transunion ] noticed an inquiry made by the company on XX/XX/2017 I did not authorized anyone employed by the company to make an inquiry and view my credit report. it have violated the Fair Credit Reporting Act Section [[1681b]] ( c ) .You are not legally entitled to make the inquiry. This is a serious breach of my privacy rights. 

I request that you either mail me a copy of my signed authorization form that gave you the right to view my credit within five ( 30 ) business daysso that I can verify its validity




XXXX   XXXX  offered me a chance to receive a secured credit card with an opening deposit twice and I was denied both times. I was n't planning on applying for anything during those times as I was repairing my credit. I applied because I was allegedly pre-approved. That 's not fair to the consumer. Neither  XXXX   XXXX  nor the  XXXX  credit bureaus ( TransUnion,  XXXX , and   XXXX    ) will ho nor removing the inquiries even though I was solicited by   XXXX   XXXX  . The FACTS states inquiries can only be pulled under certain conditions and " firm '' offers a re one of those conditions. Here is a reference to the code under ( c ) ( 1 ) ( B ) ( I ) : 15 U.S. Code [[1681b]] - Permissible purposes of consumer reports ( c ) Furnishing reports in connection with credit or insurance transactions that are not initiated by consumer ( 1 ) In general A consumer reporting agency may furnish a consumer report relating to any consumer pursuant to subparagraph ( A ) or ( C ) of subsection ( a ) ( 3 ) in connection with any credit or insurance transaction that is not initiated by the consumer only if ( A ) the consumer authorizes the agency to provide such report to such person ; or ( B ) ( i ) the transaction consists of a firm offer of credit or insurance ; ( ii ) the consumer reporting agency has complied with subsection ( e ) ; ( iii ) there is not in effect an election by the consumer, made in accor  dance with subsection ( e ), to have the consumers name and address excluded from lists of names provided by the agency pursuant to this paragraph ; and ( iv ) the consumer report does not contain a date of birth that shows that the consumer has not attained the age of 21, or, if the date of birth on the consumer report shows that the consumer has not attained the age of 21, suc h consumer consents to the consumer reporting agency to such furnishing.




While checking my personal credit report, I noticed an unauthorized and fraudulent credit inquiry made by XXXX on or about XX/XX/XXXX on Transunion. I did not authorized anyone employed by this company to make any inquiry and view my credit report. XXXX has violated the Fair Credit Reporting Act Section [[1681b]] ( c ). They were not legally entitled to make this fraudulent inquiry. This is a serious breach of my privacy rights. 
I have requested that they mail me a copy of my signed authorization form that gave them the right to view my credit within five ( 5 ) business days so that I can verify its validity and advised them that if they can not provide me with proof that I authorized them to view my credit report then I am demanding that they contact the credit bureaus immediately and have them remove the unauthorized and fraudulent hard inquiry immediately. I also requested that they remove my personal information from their records. My Social Security # is XXXX and my Date of Birth is XX/XX/XXXX in case it is needed to locate the fraudulent inquiry in their system.




Please check the attached FTC ID Theft report, there are several inquiries on my credit report, they are all listed accordingly on that report. I did not give your company permission to run my credit nor do you have any permissible purpose to run my credit.   Since it is against the FCR A, 604. Permissible purposes of consumer reports [ 15 U.S.C. [[1681b]] ] for an entity  to view a consumers credit report without a permissible purpose. I am writing to inquire as to your alleged purpose for doing so since I did not apply for any credit with your company.   This inquiry was performed under false pretenses as described in the clear language of the la w. 15 USC 1681n ( a ) ( 1 ) ( B ) wh ich states, in part, in the case of liability of a natural person for obtaining a consumer report under false pretenses or knowingly without a permissible purpose, actual damages sustained by the consumer as a result of the failure or {$1000.00}, whichever is greater ;




While checking my personal credit report, I discovered an Unauthorized and Fraudulent credit inquiry made without my KNOWLEDGE or CONSENT by XXXX on or about XXXX/XXXX/2014 on TRANSUNION credit file. I did not authorized or give permission to anyone employed by this company to make any inquiry and view my credit report. XXXX XXXX has violated the Fair Credit Reporting Act Section [[1681b]] ( c ). They were not legally entitled to make this fraudulent inquiry. This is a serious breach of my privacy rights. I am requesting that they mail me a copy of my signed authorization form that gave them the right to view my credit within five ( 5 ) business days so that I can verify its validity and advised them that if they can not provide me with proof that I authorized them to view my credit report then I am demanding that they contact the credit bureaus and have them remove the unauthorized and fraudulent hard inquiry immediately.




While checking my personal credit report, I noticed an unauthorized and fraudulent credit inquiry made by XXXX on or about XX/XX/XXXX onTransunion. I did not authorized anyone employed by this company to make any inquiry and view my credit report. XXXX has violated the Fair Credit Reporting Act Section [[1681b]] ( c ). They were not legally entitled to make this fraudulent inquiry. This is a serious breach of my privacy rights. 
I have requested that they mail me a copy of my signed authorization form that gave them the right to view my credit within five ( 5 ) business days so that I can verify its validity and advised them that if they can not provide me with proof that I authorized them to view my credit report then I am demanding that they contact the credit bureaus immediately and have them remove the unauthorized and fraudulent hard inquiry immediately. I also requested that they remove my personal information from their records. My Social Security # is XXXX and my Date of Birth is XX/XX/XXXX in case it is needed to locate the fraudulent inquiry in their system.




While checking my personal credit report, I noticed an unauthorized and fraudulent credit inquiry made by XXXX on or about XX/XX/XXXX on Transunion. I did not authorized anyone employed by this company to make any inquiry and view my credit report. XXXX has violated the Fair Credit Reporting Act Section [[1681b]] ( c ). They were not legally entitled to make this fraudulent inquiry. This is a serious breach of my privacy rights. 
I have requested that they mail me a copy of my signed authorization form that gave them the right to view my credit within five ( 5 ) business days so that I can verify its validity and advised them that if they can not provide me with proof that I authorized them to view my credit report then I am demanding that they contact the credit bureaus immediately and have them remove the unauthorized and fraudulent hard inquiry immediately. I also requested that they remove my personal information from their records. My Social Security # is XXXX and my Date of Birth is XX/XX/XXXX in case it is needed to locate the fraudulent inquiry in their system.




While checking my personal credit report, I noticed an unauthorized and fraudulent credit inquiry made by XXXX on or about XX/XX/XXXX on Transunion. I did not authorized anyone employed by this company to make any inquiry and view my credit report.XXXX has violated the Fair Credit Reporting Act Section [[1681b]] ( c ). They were not legally entitled to make this fraudulent inquiry. This is a serious breach of my privacy rights. 
I have requested that they mail me a copy of my signed authorization form that gave them the right to view my credit within five ( 5 ) business days so that I can verify its validity and advised them that if they can not provide me with proof that I authorized them to view my credit report then I am demanding that they contact the credit bureaus immediately and have them remove the unauthorized and fraudulent hard inquiry immediately. I also requested that they remove my personal information from their records. My Social Security # is XXXX and my Date of Birth is XX/XX/XXXX in case it is needed to locate the fraudulent inquiry in their system.




While checking my personal credit report, I noticed an unauthorized and fraudulent credit inquiry made by XXXX on or about XX/XX/XXXX on Transunion. I did not authorized anyone employed by this company to make any inquiry and view my credit report. XXXX has violated the Fair Credit Reporting Act Section [[1681b]] ( c ). They were not legally entitled to make this fraudulent inquiry. This is a serious breach of my privacy rights. 
I have requested that they mail me a copy of my signed authorization form that gave them the right to view my credit within five ( 5 ) business days so that I can verify its validity and advised them that if they can not provide me with proof that I authorized them to view my credit report then I am demanding that they contact the credit bureaus immediately and have them remove the unauthorized and fraudulent hard inquiry immediately. I also requested that they remove my personal information from their records. My Social Security # is XXXX and my Date of Birth is XXXX in case it is needed to locate the fraudulent inquiry in their system.

Conclusions

  • The Logistic Regression model with metadata features confirms our observations that the products most indicative of receiving relief are credit cards and prepaid cards, bank and checking accounts.
  • The model confirms that larger companies are more likely to award relief.
  • Inspection of the complaint text using the model coefficients reveals some specific issues:
    • The Dodd-Frank Act, and sections 1681b and 1681g of the US Code (Fair Credit Reporting Act)
    • Length of complaint may be indicative of likelihood of relief (examples)
    • Company names are often embedded in complaints, so there is a possibility of overfitting to individual companies

Appendices

Parameter Optimisation: Code to optimise the logistic regression model using a randomised search.

Thresholding predictions: The CFPB can control the level of recall and precision in the model predictions by applying a threshold to the prediction probabilities. In this way, they can understand the trade-off between the reduction in the number of complaints they want to achieve using the model, and the proportion of potentially successful complaints they might miss by doing so.

Latent Dirichlet Allocation: Topic modelling is an unsupervised method and a helpful way to explore text data. pyLDAvis is used to visualise the output of an LDA model.

Parameter optimisation to get a strong model

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.feature_extraction.text import TfidfTransformer

def xx_remove(X):
    """Remove runs of two or more lowercase ``x`` characters from each string.

    These runs are the redaction blanks left in the complaint text and are
    treated as noise.

    Parameters
    ----------
    X : pandas.Series
        Series of strings.

    Returns
    -------
    pandas.Series
        A new Series with every match of ``xx+`` deleted.
    """
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, so 'xx+' would never match a redaction blank.
    # NOTE(review): the pattern is case-sensitive and this step runs before
    # CountVectorizer lower-cases the text, so upper-case 'XXXX' is only
    # removed if the narratives were lower-cased beforehand — confirm.
    return X.str.replace('xx+', '', regex=True)



# Column names follow the cleaning applied when the CSV is loaded (spaces ->
# '_', lower-cased, '-' and '?' removed), so the CFPB "Sub-issue" column is
# 'subissue' here; 'sub-issue' would not be found by the ColumnTransformer.
categorical_features = ['product', 'issue', 'subissue', 'company_size']

# A scalar column name (not a list) makes ColumnTransformer hand the text
# pipeline a 1-D Series, which is what `.str` in xx_remove and
# CountVectorizer both expect.
text_features = 'consumer_complaint_narrative'

# One-hot encode the categorical metadata, then scale without centring so the
# matrix stays sparse.
categorical_transformer = Pipeline(steps=[
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ('scaler', StandardScaler(with_mean=False)),
])

# Strip the redaction blanks, count tokens, then apply TF-IDF weighting.
text_transformer = Pipeline(steps=[
    ('xx', FunctionTransformer(xx_remove)),
    ('vect', CountVectorizer(strip_accents='unicode',
                             stop_words='english',
                             lowercase=True)),
    ('tfidf', TfidfTransformer()),
])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('text', text_transformer, text_features),
    ])

# Full model: preprocessing followed by logistic regression.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])
# Imported here so this appendix section runs on its own.
from scipy import stats
from sklearn.model_selection import RandomizedSearchCV

# Search space, keyed by the step names of `clf` and its nested pipelines.
parameters = {
    'preprocessor__text__vect__max_df': (0.5, 0.75, 1.0),
    'preprocessor__text__vect__max_features': (None, 5000, 10000, 50000),
    'preprocessor__text__vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'preprocessor__text__tfidf__use_idf': (True, False),
    'preprocessor__text__tfidf__norm': ('l1', 'l2'),
    'preprocessor__cat__scaler': ['passthrough', StandardScaler(with_mean=False)],
    # scipy's uniform(loc, scale) samples from [loc, loc + scale] = [0.1, 10.1].
    'classifier__C': stats.uniform(0.1, 10),
    # Up-weight the positive class by a factor of 2..9.
    'classifier__class_weight': [{1: w, 0: 1} for w in range(2, 10)],
}

# `scoring` is a constructor argument, not a fit() argument, and the F1 scorer
# is named 'f1'. With several metrics, `refit` must name the one used to pick
# the final model; without it predict() is unavailable.
random_search = RandomizedSearchCV(
    clf,
    parameters,
    cv=3,
    n_iter=10,
    random_state=31415,
    scoring=['f1', 'recall', 'precision'],
    refit='f1',
)
random_search.fit(X_train, y_train)

# Evaluate the refitted best model on the held-out set.
preds = random_search.predict(X_test)
print(classification_report(y_test, preds))
              precision    recall  f1-score   support

           0       0.88      0.82      0.85     10647
           1       0.45      0.57      0.51      2715

    accuracy                           0.77     13362
   macro avg       0.67      0.70      0.68     13362
weighted avg       0.80      0.77      0.78     13362

Oversampling

from imblearn.over_sampling import SMOTE

# NOTE(review): SMOTE's `n_jobs` argument was deprecated in imbalanced-learn
# 0.10 and scheduled for removal in 0.12 — drop it if a newer release is used.
smote = SMOTE(n_jobs=4)

# SMOTE works on numeric features, so the raw frame is encoded first.
# `fit_resample` replaces `fit_sample`, which newer imbalanced-learn releases
# no longer provide. `preprocessor` is the ColumnTransformer built in the
# parameter-optimisation section.
X_train_encoded = preprocessor.fit_transform(X_train)
X_train_smote, y_train_smote = smote.fit_resample(X_train_encoded, y_train)

model = LogisticRegression(max_iter=200)
model.fit(X_train_smote, y_train_smote)

# The test set is only transformed (never resampled) before scoring.
preds = model.predict(preprocessor.transform(X_test))
print(classification_report(y_test, preds))

Prediction threshold

# Class probabilities from the fitted search object; column 1 is used as the
# score for the positive class below.
preds = random_search.predict_proba(X_test)

# Report precision/recall at each decision threshold from 0.3 up to (but not
# including) 1.0, in steps of 0.1.
for i in np.arange(0.3, 1, 0.1):
    print(i, '\n', classification_report(y_test, preds[:, 1] > i))

LDA


# Bag-of-words encoder: drop terms found in more than half of the documents
# or in fewer than 100 of them.
vect = CountVectorizer(strip_accents='unicode',
                       stop_words='english',
                       lowercase=True,
                       max_df=0.5,
                       min_df=100)

# Work on an explicit copy so the column assignment below modifies our own
# frame and does not raise pandas' SettingWithCopyWarning.
raw_text = df.dropna(subset=['consumer_complaint_narrative']).copy()
raw_text['consumer_complaint_narrative'] = raw_text['consumer_complaint_narrative'].str.lower()

lda = LatentDirichletAllocation(n_components=5)  # one component for each product

# Fit on a random sample of at most 10,000 complaints that received relief;
# capping at the number available keeps .sample() from raising on smaller
# datasets.
relief_narratives = raw_text.loc[raw_text.relief_received == 1,
                                 'consumer_complaint_narrative']
encoded = vect.fit_transform(
    relief_narratives.sample(min(10000, len(relief_narratives))))
lda.fit(encoded)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
import pyLDAvis

try:
    from pyLDAvis.sklearn import prepare
except ImportError:
    # Newer pyLDAvis releases ship the scikit-learn adapter as
    # `pyLDAvis.lda_model` instead of `pyLDAvis.sklearn`.
    from pyLDAvis.lda_model import prepare

# Build the visualisation data from the fitted LDA model, the document-term
# matrix it was fitted on, and the vectorizer that produced that matrix.
# Pass mds='tsne' to lay the topics out with t-SNE instead of the default.
vis = prepare(lda, encoded, vect)

pyLDAvis.enable_notebook()

pyLDAvis.display(vis)


# Take an explicit copy so the new columns are added to our own frame rather
# than to a slice of raw_text.
text_with_topics = raw_text[raw_text.relief_received == 1].copy()

# Encode and score every relief-receiving complaint with the fitted
# vectorizer and LDA model, so `topics` has exactly one row per row of
# text_with_topics. Reusing the matrix of the fitting sample would not line
# up with this frame.
topics = lda.transform(vect.transform(text_with_topics.consumer_complaint_narrative))
text_with_topics['top_topic'] = topics.argmax(axis=1)
text_with_topics['top_topic_prob'] = topics.max(axis=1)

# Number of complaints per dominant topic.
text_with_topics.top_topic.value_counts()
# Issue-by-topic contingency table, ordered by the count in topic 0.
pd.crosstab(text_with_topics['issue'], text_with_topics.top_topic).sort_values(by=0, ascending=False)
# Settings for the top-words listing below.
# NOTE(review): only n_top_words is referenced in the code that follows;
# n_samples, n_features and n_components are not used here (the LDA above was
# built with n_components=5) — confirm whether they can be removed.
n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in ``model``.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_``, one row of word weights
        per topic.
    feature_names : sequence of str
        Vocabulary, indexed consistently with the columns of
        ``model.components_``.
    n_top_words : int
        How many words to list per topic, in descending order of weight.

    One line is printed per topic, followed by a single blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Indices ordered by descending weight, cut to the requested count.
        ranked = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[idx] for idx in ranked]
        print(f"Topic #{topic_idx}: " + " ".join(words))
    print()
    
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); use the new method when it exists and
# fall back to the old one on earlier releases.
feature_names = (vect.get_feature_names_out()
                 if hasattr(vect, 'get_feature_names_out')
                 else vect.get_feature_names())
print_top_words(lda, feature_names, n_top_words)