The data that I've used here was published in Kaggle;
It is based on public data , collected from the official government website:
The data describes the receipts (or expenses) of 1188 brazillian politicians from (mainly) 2013-2017
%load_ext rpy2.ipython
df.head(3)
bugged_date | receipt_date | deputy_id | political_party | state_code | deputy_name | receipt_social_security_number | receipt_description | establishment_name | receipt_value | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2013-03-27 00:00:00 | 1772 | PSB | SP | Abelardo Camarinha | 3.530749e+12 | Fuels and lubricants. | AUTO POSTO 314 NORTE LTDA | 70 |
1 | 0 | 2013-07-24 00:00:00 | 1772 | PSB | SP | Abelardo Camarinha | 8.202116e+12 | Fuels and lubricants. | AUTO POSTO AEROPORTO LTDA | 104 |
2 | 0 | 2013-02-17 00:00:00 | 1772 | PSB | SP | Abelardo Camarinha | 8.202116e+12 | Fuels and lubricants. | AUTO POSTO AEROPORTO LTDA | 100 |
in order to find outliers we must pivot the data so "receipt_description" will become our columns and an aggregated form of "receipt_value" will become the values in the rows.
thus each row consists of each politicians numerous expenses
# these is a patch that changes R's plot sizes manually/ there is no other way doing this wahen calling R in a python kernel
default_units = 'in' # inch, to make it more easily comparable to matpplotlib
default_res = 150 # dpi, same as default in matplotlib
default_width = 14
default_height = 14
# try monkey-patching a function in rpy2, so we effectively get these
# default settings for the width, height, and units arguments of the %R magic command
import rpy2
old_setup_graphics = rpy2.ipython.rmagic.RMagics.setup_graphics
def new_setup_graphics(self, args):
if getattr(args, 'units') is not None:
if args.units != default_units: # a different units argument was passed, do not apply defaults
return old_setup_graphics(self, args)
args.units = default_units
if getattr(args, 'res') is None:
args.res = default_res
if getattr(args, 'width') is None:
args.width = default_width
if getattr(args, 'height') is None:
args.height = default_height
return old_setup_graphics(self, args)
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
large PCA inverse to PCA difference suggests this is an outlier (table in the end of cell)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
df= pd.read_csv(r'/content/drive/MyDrive/Colab Notebooks/deputies_dataset.csv')
pd.set_option('display.max_columns', 500)
agg= df.groupby(['receipt_description'])['receipt_value'].agg(mean= 'mean', med='median', std='std')
agg['rsd']= 100*(agg['std']/agg['mean'])
fule= df[df['receipt_description'] == 'Fuels and lubricants.'][['deputy_name','receipt_value']].set_index('deputy_name')
small_df= df[['deputy_name','receipt_description','receipt_value']]
df_t= df.groupby(['deputy_name','receipt_description','receipt_value'])['receipt_value'].mean()
#DataFrame.pivot(index=None, columns=None, values=None)
df_t= small_df.pivot(columns='receipt_description', values='receipt_value').set_index(small_df.deputy_name)
scaler= MinMaxScaler()
scaled_fule= pd.DataFrame(scaler.fit_transform(df_t), columns=df_t.columns).set_index(small_df.deputy_name)
df_new= scaled_fule.stack().reset_index()
df_new.columns= ['deputy_name', 'receipt_description','receipt_value']
df_new_piv= df_new.groupby(['deputy_name', 'receipt_description'])['receipt_value'].max().unstack().fillna(0)
#%%
pca= PCA(8)
df_pca= pd.DataFrame(pca.fit_transform(df_new_piv)).set_index(df_new_piv.index)
pca.explained_variance_ratio_.cumsum()
#df_pca.columns= ['x','y']
#df_pca= df_pca.sort_values(by='x', ascending=False).head(10)
#sns.scatterplot(data=df_pca, x='x', y='y', hue=df_pca.index)
df_after_inverse= pca.inverse_transform(df_pca)
pca_outliers= abs(df_after_inverse- df_new_piv).mean(axis=1).sort_values(ascending=False)
pd.DataFrame(pca_outliers).head(15)
0 | |
---|---|
deputy_name | |
Silas Câmara | 0.142052 |
Sabino Castelo Branco | 0.133315 |
Liderança Do Psdb | 0.124217 |
Marllos Sampaio | 0.116744 |
Joaquim Passarinho | 0.111989 |
... | ... |
Carlos Wilson | 0.004759 |
Cida Diogo | 0.004132 |
Fredo Junior | 0.003910 |
Reguffe | 0.003385 |
Rodovalho | 0.003206 |
1187 rows × 1 columns
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
#%%R
#install.packages('isotree')
%%R
df<- read.csv('/content/drive/MyDrive/Colab Notebooks/deputies_dataset.csv')
library('ggplot2')
library('dplyr')
library('tidyr')
#library('plotly')
library('isotree')
colnames(df)
colnames(df)
str(df)
pol_aggs <- df %>% group_by(deputy_name, receipt_description) %>% summarize(sum= sum(receipt_value), count= n())
df_m <- merge(df, pol_aggs, by=c('deputy_name', 'receipt_description'))
colnames(df)
df_agg<- df %>% group_by (deputy_name,receipt_description) %>% summarise(sum_rec = sum(receipt_value), coun_rec =n(), sum_to_count= sum(receipt_value)/n())
df_agg
R[write to console]: Attaching package: ‘dplyr’ R[write to console]: The following objects are masked from ‘package:stats’: filter, lag R[write to console]: The following objects are masked from ‘package:base’: intersect, setdiff, setequal, union
'data.frame': 3014902 obs. of 10 variables: $ bugged_date : int 0 0 0 0 0 0 0 0 0 0 ... $ receipt_date : chr "2013-03-27 00:00:00" "2013-07-24 00:00:00" "2013-02-17 00:00:00" "2013-03-15 00:00:00" ... $ deputy_id : int 1772 1772 1772 1772 1772 1772 1772 1772 1772 1772 ... $ political_party : chr "PSB" "PSB" "PSB" "PSB" ... $ state_code : chr "SP" "SP" "SP" "SP" ... $ deputy_name : chr "Abelardo Camarinha" "Abelardo Camarinha" "Abelardo Camarinha" "Abelardo Camarinha" ... $ receipt_social_security_number: num 3.53e+12 8.20e+12 8.20e+12 8.20e+12 8.20e+12 ... $ receipt_description : chr "Fuels and lubricants." "Fuels and lubricants." "Fuels and lubricants." "Fuels and lubricants." ... $ establishment_name : chr "AUTO POSTO 314 NORTE LTDA" "AUTO POSTO AEROPORTO LTDA" "AUTO POSTO AEROPORTO LTDA" "AUTO POSTO AEROPORTO LTDA" ... $ receipt_value : int 70 104 100 100 77 131 2840 2517 2509 2720 ... `summarise()` has grouped output by 'deputy_name'. You can override using the `.groups` argument. `summarise()` has grouped output by 'deputy_name'. You can override using the `.groups` argument. # A tibble: 14,265 × 5 # Groups: deputy_name [1,187] deputy_name receipt_description sum_rec coun_rec sum_to_count <chr> <chr> <int> <int> <dbl> 1 Abel Mesquita Jr. Airline tickets 430234 793 543. 2 Abel Mesquita Jr. Consultancies, Researches an… 287000 12 23917. 3 Abel Mesquita Jr. Dissemination of the Parliam… 335400 14 23957. 4 Abel Mesquita Jr. Food and Meals 246 3 82 5 Abel Mesquita Jr. Fuels and lubricants. 112231 171 656. 6 Abel Mesquita Jr. Leasing or Chartering of Boa… 9500 1 9500 7 Abel Mesquita Jr. Maintenance of Office 2451 16 153. 8 Abel Mesquita Jr. Postal Services 1127 18 62.6 9 Abel Mesquita Jr. Rental or Chartering of Moto… 198925 23 8649. 10 Abel Mesquita Jr. Security Service Provided By… 221000 26 8500 # … with 14,255 more rows
to find outliers in :
pred1 : sum of receipts
pred2 : count of receipts
pred3 (most important one) : average value of a receipt
the plot in the end is sum vs count, when size is the avg, so dots high or right with large size are the the one to look for
%%R
#install.packages ("ggforce") # for circle
NULL
%%R
library(ggforce)
df_piv <- spread(df_agg[,-4:-5], receipt_description, sum_rec)
df_piv2 <- spread(df_agg[,c(-3,-5)], receipt_description, coun_rec)
df_piv3 <- spread(df_agg[,c(-3,-4)], receipt_description, sum_to_count) # mean
# fill na :
df_piv[is.na(df_piv)] <- 0
df_piv2[is.na(df_piv2)] <- 0
df_piv3[is.na(df_piv3)] <- 0
# train :
iso <- isolation.forest(df_piv, ntrees = 10, nthreads = 1)
iso2 <- isolation.forest(df_piv2, ntrees = 10, nthreads = 1)
iso3 <- isolation.forest(df_piv3, ntrees = 10, nthreads = 1)
# predict :
pred <- predict(iso, df_piv)
pred2 <- predict(iso2, df_piv2)
pred3 <- predict(iso3, df_piv3)
preds_table <- data.frame(dep= df_piv$deputy_name, pred)
preds_table2 <- data.frame(dep= df_piv$deputy_name, pred2)
preds_table3 <- data.frame(dep= df_piv$deputy_name, pred3)
joined_table <- merge(preds_table, preds_table2 , by='dep')
joined_table <- merge(joined_table, preds_table3, by= 'dep' )
slice(arrange(joined_table,desc(pred3)), 1:10)
options(repr.plot.width = 15, repr.plot.height = 15)
pl<- ggplot(joined_table , aes(x=pred,y=pred2)) + geom_point (aes(size=pred3)) +
geom_text(aes(label=dep ),nudge_y = +0.005 ,data= slice(arrange(joined_table,desc(pred3)), 1:35))+
geom_circle(aes(x0 = 0.56, y0 = 0.36, r = 0.062), color='red',inherit.aes = FALSE)
print(pl)
agg_gen<- df %>% group_by (deputy_name) %>% summarise(mean_rec= mean(receipt_value)) %>% arrange(desc(mean_rec))
agg_gen2<- df %>% group_by (receipt_description) %>% summarise(mean_rec= mean(receipt_value))
summary(agg_gen) # mean = 719
summary(agg_gen2)
joined_table <- joined_table %>% arrange(desc(pred3))
colnames(df_m)
df_m_filt1 <- subset(df_m, deputy_name %in% filter(agg_gen, mean_rec>719)$deputy_name )
df_m_filt2 <- subset(df_m_filt1, deputy_name %in% filter(joined_table, pred3>0.5 )$dep)
lista<-distinct(select (df_m,receipt_description) )
filt1<-lista[rownames(lista)[1:10],]
filt2<-lista[rownames(lista)[11:21],]
df_m_filt2a <- subset(df_m_filt2, receipt_description %in% filt1)
df_m_filt2b <- subset(df_m_filt2, receipt_description %in% filt2)
35 politicans with the largest average expanse are marked, we should focus on those that have also abnormal sum or large count of recipts (latter are mark in red circle )
%%R
#install.packages('ggh4x')
NULL
Check where the expenses went for the top 20 ouliers-- (y - sum of recipts , x- counts)
%%R
library(ggh4x)
pl1<- ggplot(df_m_filt2a, aes(x=count, y=sum, color=deputy_name ) ) +
geom_point(show.legend = T) +
geom_text(aes(label = deputy_name), size= 3, , nudge_y = +1000)+
facet_wrap(.~receipt_description, scales='free')
#force_panelsizes(cols = c(15,15, 15,15, 1,1))
pl1
%%R
pl2<- ggplot(df_m_filt2b, aes(x=count, y=sum, color=deputy_name ) ) +
geom_point(show.legend = T) +
geom_text(aes(label = deputy_name), size=3)+
facet_wrap(.~receipt_description, scales='free')
pl2