our goal is to analyze the dmv's cars fact table-
first we will combine the fact table with the KM's information table and calaculte km per month for each car and monthly km ouliers,
then by joining dim tables and checking changes in car attributes (like car model fuel type and color) trends over the years for general popluation and oultiers by monthly km
lastly we will try to detect suspicous outliers in car owenership type
connecting to SAS with jupyter and SASPY:
import saspy, os
print(saspy.__file__.replace('__init__.py', 'sascfg_personal.py'))
D:\anaconda3\lib\site-packages\saspy\sascfg_personal.py
import saspy
sas = saspy.SASsession(results='HTML')
sas
Using SAS Config named: oda Error trying to read authinfo file:C:\Users\roysh\_authinfo [Errno 2] No such file or directory: 'C:\\Users\\roysh\\_authinfo' Did not find key oda in authinfo file:C:\Users\roysh\_authinfo
Please enter the IOM user id: u61485743 Please enter the password for IOM user : ········
SAS Connection established. Subprocess id is 5896
Access Method = IOM SAS Config name = oda SAS Config file = D:\anaconda3\lib\site-packages\saspy\sascfg_personal.py WORK Path = /saswork/SAS_work351A00003983_odaws02-euw1.oda.sas.com/SAS_work655C00003983_odaws02-euw1.oda.sas.com/ SAS Version = 9.04.01M6P11072018 SASPy Version = 4.2.0 Teach me SAS = False Batch = False Results = HTML SAS Session Encoding = utf-8 Python Encoding value = utf-8 SAS process Pid value = 14723
%%SAS
libname fp "/home/u61485743";
options obs= max;
options nosource;
*****************************************************************************;
********************************* Assignment 1 ***********************************;
*****************************************************************************;
*
data fp.fact_cars_reg / NOLIST;
infile "/home/u61485743/sas_course/mia_cars_reg_*.txt" dsd flowover
delimiter='|' eov=eov firstobs=2 ;
length tozeret_cd $ 4 sug_degem_cd $ 1 baalut_cd $ 1
sug_delek_cd $ 2 mispar_rechev $ 8 moed_aliya_lakvish $ 15;
array col_to_upper sug_delek_cd baalut_cd;
input @; /* for every input line: read the line to check for eov flag */
/* when new file starts : EOV turns to 1 */
if eov=1 then input; /* then skips first input line (header) for the new file */
/* else: regular input line */
input mispar_rechev $ tozeret_cd $ sug_degem_cd $ ramat_eivzur_betihuty
kvutzat_zihum shnat_yitzur baalut_cd $ tzeva_cd sug_delek_cd $
moed_aliya_lakvish $ mivchan_acharon_dt: YYMMDD10.;
format mivchan_acharon_dt YYMMDD10.;
tozeret_cd = cat(repeat('0',4-length(tozeret_cd)-1),tozeret_cd);
do over col_to_upper ;
col_to_upper= upcase(col_to_upper);
end;
eov= 0; /* resets the eov flag */
drop eov;
run;
proc contents data=fp.fact_cars_reg;
run;
/* */
/* proc freq data=fp.fact_cars_reg; */
/* tables sug_delek_cd baalut_cd; */
/* run; */
The CONTENTS Procedure
Data Set Name | FP.FACT_CARS_REG | Observations | 732264 |
---|---|---|---|
Member Type | DATA | Variables | 11 |
Engine | V9 | Indexes | 0 |
Created | 06/20/2022 10:23:00 | Observation Length | 72 |
Last Modified | 06/20/2022 10:23:00 | Deleted Observations | 0 |
Protection | Compressed | NO | |
Data Set Type | Sorted | NO | |
Label | |||
Data Representation | SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64 | ||
Encoding | utf-8 Unicode (UTF-8) |
Engine/Host Dependent Information | |
---|---|
Data Set Page Size | 131072 |
Number of Data Set Pages | 404 |
First Data Page | 1 |
Max Obs per Page | 1816 |
Obs in First Data Page | 1772 |
Number of Data Set Repairs | 0 |
Filename | /home/u61485743/fact_cars_reg.sas7bdat |
Release Created | 9.0401M6 |
Host Created | Linux |
Inode Number | 90309990 |
Access Permission | rw-r--r-- |
Owner Name | u61485743 |
File Size | 51MB |
File Size (bytes) | 53084160 |
Alphabetic List of Variables and Attributes | ||||
---|---|---|---|---|
# | Variable | Type | Len | Format |
3 | baalut_cd | Char | 1 | |
8 | kvutzat_zihum | Num | 8 | |
5 | mispar_rechev | Char | 8 | |
11 | mivchan_acharon_dt | Num | 8 | YYMMDD10. |
6 | moed_aliya_lakvish | Char | 15 | |
7 | ramat_eivzur_betihuty | Num | 8 | |
9 | shnat_yitzur | Num | 8 | |
2 | sug_degem_cd | Char | 1 | |
4 | sug_delek_cd | Char | 2 | |
1 | tozeret_cd | Char | 4 | |
10 | tzeva_cd | Num | 8 |
%%SAS
libname fp "/home/u61485743";
*****************************************************************************;
********************************* Assignment 2 ***********************************;
*****************************************************************************;
**** 1: adding new mispar_rechev col with the "-" notation;
**** 2: creating 2 cases: 7 chars "XXX-XX-XXX" mispar_rechev and 8 chars a "XX-XXX-XX"
**** 3: if 7 chars : concats without starting zero ;
DATA fp.fact_cars_reg/ NOLIST;
length mispar_rechev_fmt $ 10;
set fp.fact_cars_reg;
if substr(mispar_rechev,1,1) = "0" then do;
mispar_rechev_fmt = cats(substr(mispar_rechev,2,2), "-" ,
substr(mispar_rechev,4,3), "-", substr(mispar_rechev,7,2));
end;
else if length(mispar_rechev) = 8 then do;
mispar_rechev_fmt = cats(substr(mispar_rechev,1,3), "-" ,
substr(mispar_rechev,4,2), "-",
substr(mispar_rechev,6,3));
end;
run;
49 The SAS System Monday, June 20, 2022 09:34:00 AM
E3969440A681A2408885998500000025
%%SAS
libname fp "/home/u61485743";
*****************************************************************************;
********************************* Assignment 3 ***********************************;
*****************************************************************************;
/* palindroms precent for tozeret_cd over 1000 car count */;
/* strip*/
proc sql;
create table fp.palindroms as
select tozeret_cd,
(a.count_palind / a.count_vehec) as percent_palindrom format percent8.3,
a.count_vehec,
a.count_palind
from (
select tozeret_cd,
count(mispar_rechev) as count_vehec,
case when count(mispar_rechev) >= 1000 then 1 else 0 end as check_1000,
sum(case when compress(strip(mispar_rechev_fmt),"-") =
reverse(compress(strip(mispar_rechev_fmt),"-"))
then 1 else 0 end) as count_palind
from fp.fact_cars_reg
group by tozeret_cd
having check_1000 =1
)a
order by percent_palindrom desc;
quit;
proc print data=fp.palindroms (obs=20);
run;
Obs | tozeret_cd | percent_palindrom | count_vehec | count_palind |
---|---|---|---|---|
1 | 0430 | 0.232% | 10355 | 24 |
2 | 0488 | 0.225% | 1333 | 3 |
3 | 0940 | 0.162% | 4325 | 7 |
4 | 0603 | 0.148% | 8086 | 12 |
5 | 0593 | 0.094% | 8492 | 8 |
6 | 0830 | 0.091% | 1102 | 1 |
7 | 0152 | 0.089% | 1123 | 1 |
8 | 0771 | 0.060% | 18458 | 11 |
9 | 0590 | 0.045% | 31378 | 14 |
10 | 0253 | 0.024% | 20813 | 5 |
11 | 0624 | 0.024% | 8474 | 2 |
12 | 0351 | 0.023% | 8737 | 2 |
13 | 0981 | 0.021% | 4761 | 1 |
14 | 0481 | 0.016% | 51336 | 8 |
15 | 0683 | 0.009% | 10579 | 1 |
16 | 0672 | 0.007% | 14148 | 1 |
17 | 0839 | 0.006% | 46286 | 3 |
18 | 0299 | 0.006% | 15461 | 1 |
19 | 0845 | 0.006% | 33047 | 2 |
20 | 0413 | 0.005% | 21578 | 1 |
first importing the extra data:
%%SAS
libname fp "/home/u61485743";
*****************************************************************************;
********************************* Assignment 4 ***********************************;
*****************************************************************************;
/* importing 2 dim tables : xlsx and csv files
/* changing formats and lenghts
/* creating 3 dim tables by manual typing inputs -> datalines */
proc import datafile= "/home/u61485743/sas_course/dim_tozeret.xlsx"
dbms = xlsx
out= work.dim_tozeret_temp
replace;
run;
/* */
/* this table is not avilable */
/* proc import datafile= "S:\workshop\MIA\mia_dict_colors.csv" */
/* dbms = csv */
/* out= work.dict_colors_temp */
/* replace; */
/* guessingrows= max; */
/* run; */
/* */
proc sql;
create table fp.dim_tozeret as
select put(tozeret_cd ,z4.) as tozeret_cd , /* numeric to char */
put(tozeret_nm, $CHAR20.) as tozeret_nm /*char to char */
/* put(tozeret_country_cd, $CHAR2.) as tozeret_country_cd /*char to char */
from dim_tozeret_temp;
quit;
/* create table fp.dim_tzeva as */
/* select tzeva_cd , /* numeric to char */
/* put(tzeva_nm, $CHAR20.) as tzeva_nm, /*char to char */
/* put(tzeva_type, $CHAR10.) as tzeva_type /*char to char */
/* from dict_colors_temp; */
/* run; */
/* */
data fp.dim_degem / NOLIST;
input +1 sug_degem_cd $1. sug_degem_nm $20. ;
cards;
P Private
M Commercial
;
run;
data fp.dim_baalut / NOLIST;
input +1 baalut_cd $1. baalut_nm $20. ;
datalines;
R Rent
C Company
L Leasing
P Private
D Dealer
;
run;
/* exe sheet indicates length 1 to delek_cd ,
/* I've changed to length 2 to store all the input correctly */
data fp.dim_delek / NOLIST;
input +1 delek_cd $2. delek_nm $20. ;
datalines;
G Gasoline
L LPG
D Diesel
E Electric
EG Electric\Gasoline
ED Electric\Diesel
;
run;
50 The SAS System Monday, June 20, 2022 09:34:00 AM
E3969440A681A2408885998500000026
%%SAS
libname fp "/home/u61485743";
*****************************************************************************;
********************************* Assignment 5 ***********************************;
*****************************************************************************;
proc sql;
create table fact_cars_reg_flat_temp (drop= delek_cd)
as
select *
from fp.fact_cars_reg f
left join fp.dim_tozeret d1
on f.tozeret_cd = d1.tozeret_cd
/* left join fp.dim_tzeva d2 */
/* on f.tzeva_cd = d2.tzeva_cd */
left join fp.dim_degem d3
on f.sug_degem_cd = d3.sug_degem_cd
left join fp.dim_baalut d4
on f.baalut_cd= d4.baalut_cd
left join fp.dim_delek d5
on f.sug_delek_cd= d5.delek_cd;
quit;
run;
/* fill nulls in character and numeric columns */
/* using arrays to store the data and do-loops for less manual work */
data fp.fact_cars_reg_flat;
set fact_cars_reg_flat_temp;
array num_cols _numeric_;
array cat_cols _character_;
do over cat_cols;
if missing(cat_cols)
then cat_cols= "Unknown";
end;
do over num_cols;
if missing(num_cols)
then num_cols = -1 ;
end;
run;
proc report data=fp.fact_cars_reg_flat (obs=10);
run;
mispar_rechev_fmt | tozeret_cd | sug_degem_cd | baalut_cd | sug_delek_cd | mispar_rechev | moed_aliya_lakvish | ramat_eivzur_betihuty | kvutzat_zihum | shnat_yitzur | tzeva_cd | mivchan_acharon_dt | tozeret_nm | sug_degem_nm | baalut_nm | delek_nm |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
112-33-302 | 0496 | M | P | D | 11233302 | 2020-9 | 4 | 13 | 2020 | 21 | 2022-03-13 | Unknown | Commercial | Private | Diesel |
831-23-601 | 0624 | M | P | D | 83123601 | 2019-8 | 3 | 15 | 2019 | 80 | 2021-07-12 | nissan | Commercial | Private | Diesel |
282-89-001 | 0496 | M | P | D | 28289001 | 2018-3 | -1 | 9 | 2018 | 80 | 2022-03-17 | Unknown | Commercial | Private | Diesel |
540-76-901 | 0806 | M | P | D | 54076901 | 2019-6 | -1 | 15 | 2019 | 80 | 2019-06-05 | Unknown | Commercial | Private | Diesel |
860-07-601 | 0771 | P | R | D | 86007601 | 2019-12 | 3 | 6 | 2019 | 80 | 2021-11-01 | renaullt | Private | Rent | Diesel |
435-99-701 | 0496 | M | P | D | 43599701 | 2018-7 | -1 | 9 | 2018 | 80 | 2021-07-27 | Unknown | Commercial | Private | Diesel |
397-08-001 | 0724 | M | P | D | 39708001 | 2018-5 | -1 | 15 | 2018 | 80 | 2021-04-21 | Unknown | Commercial | Private | Diesel |
719-06-201 | 0973 | M | P | D | 71906201 | 2020-1 | -1 | 15 | 2020 | 80 | 2021-07-04 | Unknown | Commercial | Private | Diesel |
436-76-901 | 0496 | M | P | D | 43676901 | 2018-8 | -1 | 9 | 2018 | 80 | 2021-08-28 | Unknown | Commercial | Private | Diesel |
645-27-201 | 0496 | M | P | D | 64527201 | 2019-7 | -1 | 9 | 2019 | 80 | 2021-06-27 | Unknown | Commercial | Private | Diesel |
%%SAS
libname fp "/home/u61485743";
*****************************************************************************;
********************************* Assignment 6 ***********************************;
*****************************************************************************;
filename km "/home/u61485743/sas_course/mia_km_last_test.txt" ;
data fp.km_last_test;
infile km
delimiter=',' firstobs=2 ;
length mispar_rechev $ 8;
input mispar_rechev $ mivchan_aharon_km;
mispar_rechev= cat(repeat('0',8-length(mispar_rechev)-1),mispar_rechev);
run;
proc sql outobs = 2000;
create table test_rand as
select * from fp.fact_cars_reg
order by RANUNI (42);
quit;
proc sql outobs=50;
select f.mispar_rechev ,d.mispar_rechev
,d.mivchan_aharon_km
from test_rand f
left join fp.km_last_test d
on f.mispar_rechev = d.mispar_rechev;
quit;
mispar_rechev | mispar_rechev | mivchan_aharon_km |
---|---|---|
02330222 | 02330222 | 30879 |
06008426 | . | |
06037226 | . | |
06105726 | . | |
06211526 | . | |
06219926 | . | |
06392526 | . | |
06508826 | . | |
07175526 | . | |
07233384 | 07233384 | 88844 |
07303726 | . | |
07453926 | . | |
07464826 | . | |
07533226 | . | |
07559526 | . | |
07566526 | . | |
07620626 | . | |
07637826 | . | |
07651726 | . | |
07701726 | . | |
07707326 | . | |
07753926 | . | |
07755426 | . | |
07803026 | . | |
07857226 | . | |
07870626 | . | |
07872126 | . | |
07872426 | . | |
07904726 | . | |
07915526 | . | |
07917326 | . | |
07918826 | . | |
08486580 | 08486580 | 129200 |
08784526 | . | |
08950826 | . | |
08993726 | . | |
10671202 | 10671202 | 0 |
10681002 | 10681002 | 0 |
10717402 | 10717402 | 0 |
10740602 | 10740602 | 0 |
10785302 | 10785302 | 0 |
10804502 | 10804502 | 0 |
10984502 | . | |
11224402 | 11224402 | 0 |
11240102 | . | |
11244602 | 11244602 | 0 |
11326202 | 11326202 | 0 |
11456502 | 11456502 | 0 |
11625502 | 11625502 | 56730 |
11631802 | 11631802 | 0 |
with calculation of:
-aliya_lakvish- start month and mivchan_achcron- end month (intnx functin)
-months between start month and end month (intck fucntion)
-avg km per month
-values where calcualtions were not feasible replaced with nulls */
%%SAS
libname fp "/home/u61485743";
*****************************************************************************;
********************************* level 7 ***********************************;
*****************************************************************************;
/* 1 - joining km dim table to fact table
with calculation of:
-aliya_lakvish- start month and mivchan_achcron- end month (intnx functin)
-months between start month and end month (intck fucntion)
-avg km per month
-values where calcualtions were not feasible replaced with nulls */
/* 2 - summary table of the above*/
/* 3 - distribution plot of the above (avg km per month) without outliers */
/* 4 - % of non-calculated (nulls, zeroes and negative values) vs calaculated of avg kp per month*/
proc sql ;
create table fp.fact_avg_km as
select d.mispar_rechev as mispar_rechev_dim , f.*
,d.mivchan_aharon_km
,intnx('month',input(f.moed_aliya_lakvish , anydtdte.),1) as start_date
/* 'b' --> start of month */
,intnx('month',mivchan_acharon_dt ,0,'b') as end_date
,mivchan_acharon_dt
,intck('month',calculated start_date ,calculated end_date,'d') as months_passed
,mivchan_aharon_km / intck('month',calculated start_date ,calculated end_date,'d') as avg_monthly_km_tmp
,case when calculated months_passed > 0 and calculated avg_monthly_km_tmp > 0
then calculated avg_monthly_km_tmp end as avg_monthly_km
from FP.FACT_CARS_REG_FLAT f
left join fp.km_last_test d
on f.mispar_rechev = d.mispar_rechev;
quit;
/* find number of nulls in any of the 2 tables - fact or dim */
proc sql;
create table nulls_in_tables as
select
sum(case when f.mispar_rechev is null then 1 end) as num_nulls_in_dim,
sum(case when mivchan_aharon_km is null then 1 end) as num_nulls_in_fact
from FP.FACT_CARS_REG_FLAT f
full join fp.km_last_test d
on f.mispar_rechev = d.mispar_rechev;
quit; /* no nulls in dim! */
proc print data= nulls_in_tables;
run;
/* summary table and distribution plot */
proc sql ;
create table avg_km_summary as
select mispar_rechev,
put(start_date, YYMMDD10.) as start_date,
put(end_date ,YYMMDD10.) as end_date,
months_passed,
avg_monthly_km
from fp.fact_avg_km;
quit;
title "avg montly km distribution by count of cars";
proc sgplot data= avg_km_summary;
histogram avg_monthly_km /nbins=20 scale=count ;
density avg_monthly_km;
where avg_monthly_km > 0 and avg_monthly_km <5000;
run;
title;
/* non-calculated vs calculated */
proc sql;
create table fp.avg_km_agg as
select
count(*) as num_cars ,
sum(case when avg_monthly_km =. then 1 end)
/ count(*) as percent_uncalculated foramt percent8.3,
sum(case when avg_monthly_km ~=. then 1 end)
/ count(*) as percent_calculated foramt percent8.3,
sum (case when months_passed <= 0 or months_passed is missing then 1 end) as negative_or_null_months, /*negative or months */
sum(case when (mivchan_aharon_km <=0 or mivchan_aharon_km is missing) and months_passed>0 then 1 end) as months_without_km, /*months but no km */
sum(case when avg_monthly_km is missing then 1 end) as total_missing
from
fp.fact_avg_km ;
quit;
proc report data= fp.avg_km_agg;
run;
Obs | num_nulls_in_dim | num_nulls_in_fact |
---|---|---|
1 | . | 36695 |
num_cars | percent_uncalculated | percent_calculated | negative_or_null_months | months_without_km | total_missing |
---|---|---|---|---|---|
732264 | 42.22% | 57.78% | 266336 | 42806 | 309142 |
we can see in the avg_monthly_km Histogram above the general distribuition
כיוון שלא ניתן להביא את הטבלה במלואה (700 אלף ערכים ) אני מצרף כאן למעלה היסטוגרמה של התפלגות ק"מ ממוצעים לחודש
(כלומר ספירת כמות הרכבים בכל bin)
בניכוי ערכים קיצוניים מאוד ,קרי תמונה מצב כללית
מתחת לגרף ההיסטוגרמה ישנה טבלת סיכום אחוז הרכבים שניתן היה לבצע להם חישוב לעומת אלו שלא ניתן היה
%%SAS
libname fp "/home/u61485743";
*****************************************************************************;
********************************* level 8 ***********************************;
*****************************************************************************;
proc univariate data= fp.fact_avg_km ;
var avg_monthly_km;
run;
/* adding columns to lable the ouliers/ nob-outliers and aggregate the counts for each year */
proc sql;
create table fp.km_outliers as
select *, avg(avg_monthly_km),
median(avg_monthly_km),
STD(avg_monthly_km),
count(*) as total_count,
case when shnat_yitzur = 2018
and avg_monthly_km >3000 then "outlier"
when avg_monthly_km <=3000 then "non-outlier" end as is_outlier_2018,
sum(case when calculated is_outlier_2018 = "outlier" then 1 end) as count_outlier_2018,
sum(case when calculated is_outlier_2018 = "non-outlier" then 1 end) as count_non_outlier_2018,
case when shnat_yitzur = 2019
and avg_monthly_km >3000 then "outlier"
when avg_monthly_km <=3000 then "non-outlier" end as is_outlier_2019,
case when shnat_yitzur = 2020
and avg_monthly_km >3000 then "outlier"
when avg_monthly_km <=3000 then "non-outlier" end as is_outlier_2020,
sum(case when calculated is_outlier_2019 = "outlier" then 1 end) as count_outlier_2019,
sum(case when calculated is_outlier_2020 = "outlier" then 1 end) as count_outlier_2020
from fp.fact_avg_km
where avg_monthly_km is not missing
and sug_degem_cd= 'P'
;
quit;
/* creating a macro function the will loop over the attributes we want to check and creates plots */
/*ods excel file='S:\workshop\output\report.xlsx';*/
ods graphics on;
%macro outlier_report(namer);
%do i= 1 %to 6;
/*creating a pivot table for each car attribute - outlier/non-outluer/total population precentage */
proc sql outobs=10;
create table agg_by_category_&i as
select
%scan(&namer,&i),
count(*) / total_count format percent10.2 as tot_pop,
sum(case when is_outlier_2018 = 'non-outlier' then 1 end) / count_non_outlier_2018 format percent10.2 as non_outlier,
sum(case when is_outlier_2018 = 'outlier' then 1 end) / count_outlier_2018 format percent10.2 as outlier_2018,
sum(case when is_outlier_2019 = 'outlier' then 1 end) / count_outlier_2019 format percent10.2 as outlier_2019,
sum(case when is_outlier_2020 = 'outlier' then 1 end) / count_outlier_2020 format percent10.2 as outlier_2020
from fp.km_outliers
where %scan(&namer,&i) is not missing
group by %scan(&namer,&i), total_count, count_non_outlier_2018 ,count_outlier_2018 , count_outlier_2019 , count_outlier_2020
order by tot_pop desc;
quit;
ods layout start columns =2 rows =1;
/* plotting the pivot table */
ods region row= 1 column= 1;
proc sgplot data = agg_by_category_&i;
xaxis label= ' ';
hbar %scan(&namer,&i) / response= outlier_2018 ;
hbar %scan(&namer,&i) / response= non_outlier barwidth=0.5;
run;
/* adding the data set table */
proc print data= agg_by_category_&i (drop=outlier_2019 outlier_2020);
run;
proc sort data=agg_by_category_&i;
by %scan(&namer,&i);
run;
/* unpivot with transposing in order to plot attributes of outliers by year */
proc transpose data= agg_by_category_&i out= AGG_BY_CATEGORY_T_&i;
by %scan(&namer,&i);
var outlier_:;
run;
/* chnaging column name and types after transposition */
data AGG_BY_CATEGORY_T2_&i;
set AGG_BY_CATEGORY_T_&i;
time_col = input(substr(_NAME_ ,9,4), 5.);
percent_outliers= col1;
format percent_outliers percent7.3;
run;
/* plotting a line plot for each category of the attribute over the years */
ods region row= 1 column= 2;
proc sgplot data= AGG_BY_CATEGORY_T2_&i;
series x= time_col y=percent_outliers / group = %scan(&namer,&i) curvelabel ;
run;
/* adding the data set table */
proc print data= agg_by_category_&i (drop= non_outlier tot_pop);
run;
ods layout end;
%end;
%mend;
/*car attributes to check */
%let namer = delek_nm baalut_nm kvutzat_zihum tozeret_nm ramat_eivzur_betihuty tzeva_cd;
/*using the macro function */
%outlier_report (&namer);
The UNIVARIATE Procedure
Variable: avg_monthly_km
Moments | |||
---|---|---|---|
N | 423122 | Sum Weights | 423122 |
Mean | 1751.87522 | Sum Observations | 741256946 |
Std Deviation | 1683.21353 | Variance | 2833207.79 |
Skewness | 88.0465642 | Kurtosis | 17387.3711 |
Uncorrected SS | 2.49738E12 | Corrected SS | 1.19879E12 |
Coeff Variation | 96.0806749 | Std Error Mean | 2.58765522 |
Basic Statistical Measures | |||
---|---|---|---|
Location | Variability | ||
Mean | 1751.875 | Std Deviation | 1683 |
Median | 1540.171 | Variance | 2833208 |
Mode | 1000.000 | Range | 452524 |
Interquartile Range | 1307 |
Tests for Location: Mu0=0 | ||||
---|---|---|---|---|
Test | Statistic | p Value | ||
Student's t | t | 677.0126 | Pr > |t| | <.0001 |
Sign | M | 211561 | Pr >= |M| | <.0001 |
Signed Rank | S | 4.476E10 | Pr >= |S| | <.0001 |
Quantiles (Definition 5) | |
---|---|
Level | Quantile |
100% Max | 4.52524E+05 |
99% | 4.85296E+03 |
95% | 3.64460E+03 |
90% | 3.08863E+03 |
75% Q3 | 2.28696E+03 |
50% Median | 1.54017E+03 |
25% Q1 | 9.80176E+02 |
10% | 6.26872E+02 |
5% | 4.61333E+02 |
1% | 2.36412E+02 |
0% Min | 2.12766E-02 |
Extreme Observations | |||
---|---|---|---|
Lowest | Highest | ||
Value | Obs | Value | Obs |
0.0212766 | 378291 | 195000 | 503137 |
0.0212766 | 350053 | 213900 | 396266 |
0.0212766 | 219431 | 257210 | 321100 |
0.0212766 | 187786 | 284120 | 556136 |
0.0217391 | 305461 | 452524 | 144669 |
Missing Values | |||
---|---|---|---|
Missing Value |
Count | Percent Of | |
All Obs | Missing Obs | ||
. | 309142 | 42.22 | 100.00 |
|
|
|
|
|
|
|
|
|
|
|
|
looking at the quantiles table we can deduce that values higher than 3000 km/month can be conseidered as outliers
we will combine and than aggregate by count of each type (P, R, L, D and C) , the unique number of types and total ownerships count for each car
than we will try to find the outliers in the aggregation table
%%SAS
libname fp "/home/u61485743";
*****************************************************************************;
********************************* assignment 9 ***********************************;
*****************************************************************************;
data fp.dim_baalut_hist;
infile "S:\workshop\MIA\mia_baalut_hist.csv"
delimiter=',' firstobs=2 ;
length baalut_cd $ 1;
input mispar_rechev $ baalut_start_dt : YYMMN. baalut_cd $ ;
format baalut_start_dt date9.;
run;
proc contents date=fp.dim_baalut_hist;
run;
*****************************************************************************;
********************************* assignment 10 ***********************************;
*****************************************************************************;
/* discalimer:
/* final calculation of AVG baalut months does not take into considiration
/* the last transaction of a mispar_rechev in the case baaulut equals 'D'
/* since there is no way to estimate correctly last period of baalut in that case */
/* which is still on-going (to my understanding)*/
proc sort data= FP.DIM_BAALUT_HIST;
by mispar_rechev descending baalut_start_dt;
run;
/* add a "lead" column to date of baalut change
/* and calculate diff in months by using lag in desc order */
data FP.DIM_BAALUT_HIST_LAG;
set FP.DIM_BAALUT_HIST;
by mispar_rechev;
next_change_dt = lag1(baalut_start_dt);
if first.mispar_rechev then next_change_dt=. ;
format next_change_dt date9. ;
month_gap= intck('month', baalut_start_dt ,next_change_dt);
run;
/* subquery-> 2 aggregations in the car granularity: */
/* first: count rows where "D" value, gives 1 if larger than 0
/* second : count months in the dealer state per car */
/* outer query-> 2 aggregation in the yazran granularity :
/* avg on months in the dealer state
/* and count cars that had "D" state */
proc sql;
create table FP.FACT_CARS_REG_FLAT2 as
select tozeret_nm,
count(mispar_rechev) as count_cars_dealer_state,
avg(sum_months_in_D_state) foramt 6.2 as avg_months_in_Dealer_state
from
(
/* proc sql inobs=100;*/
select h.mispar_rechev, tozeret_nm,
case when (sum(case when h.baalut_cd = 'D' then 1 end )) >= 1 then 1 end as is_d,
SUM(case when h.baalut_cd = 'D' then month_gap end) as sum_months_in_D_state
from FP.DIM_BAALUT_HIST_LAG as h
left join FP.FACT_CARS_REG_FLAT as f
on f.mispar_rechev = h.mispar_rechev
group by h.mispar_rechev, tozeret_nm
)a
group by tozeret_nm
order by count_cars_dealer_state desc;
quit;
proc print data= FP.FACT_CARS_REG_FLAT2;
run;
D:\anaconda3\lib\site-packages\saspy\sasioiom.py:1011: UserWarning: Noticed 'ERROR:' in LOG, you ought to take a look and see if there was a problem warnings.warn("Noticed 'ERROR:' in LOG, you ought to take a look and see if there was a problem")
11 The SAS System Monday, June 20, 2022 04:41:00 PM
ERROR: Physical file does not exist, /pbr/biconfig/940/Lev1/SASApp/S:\workshop\MIA\mia_baalut_hist.csv.
WARNING: The data set FP.DIM_BAALUT_HIST may be incomplete. When this step was stopped there were 0 observations and 3 variables.
222 proc contents date=fp.dim_baalut_hist;
____
1
WARNING 1-322: Assuming the symbol DATA was misspelled as date.
12 The SAS System Monday, June 20, 2022 04:41:00 PM
E3969440A681A2408885998500000006
The CONTENTS Procedure
Data Set Name | FP.DIM_BAALUT_HIST | Observations | 0 |
---|---|---|---|
Member Type | DATA | Variables | 3 |
Engine | V9 | Indexes | 0 |
Created | 06/20/2022 18:28:10 | Observation Length | 24 |
Last Modified | 06/20/2022 18:28:10 | Deleted Observations | 0 |
Protection | Compressed | NO | |
Data Set Type | Sorted | NO | |
Label | |||
Data Representation | SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64 | ||
Encoding | utf-8 Unicode (UTF-8) |
Engine/Host Dependent Information | |
---|---|
Data Set Page Size | 131072 |
Number of Data Set Pages | 1 |
First Data Page | 1 |
Max Obs per Page | 5431 |
Obs in First Data Page | 0 |
Number of Data Set Repairs | 0 |
Filename | /home/u61485743/dim_baalut_hist.sas7bdat |
Release Created | 9.0401M6 |
Host Created | Linux |
Inode Number | 90309949 |
Access Permission | rw-r--r-- |
Owner Name | u61485743 |
File Size | 256KB |
File Size (bytes) | 262144 |
Alphabetic List of Variables and Attributes | ||||
---|---|---|---|---|
# | Variable | Type | Len | Format |
1 | baalut_cd | Char | 1 | |
3 | baalut_start_dt | Num | 8 | DATE9. |
2 | mispar_rechev | Char | 8 |
2 methods to finding outliers :
with proc freq finding the 99th precentile. all values that are higher will be considerwd outliers
proc robustreg with mcd algorithm. calculates distance from the center of the data. higher distance means more extreme outlier. enables us to rank by the uniqueness of the observation.
*****************************************************************************;
********************************* assignment 11 ***********************************;
*****************************************************************************;
proc sql;
create table multi_baalut as
select mispar_Rechev,
count (distinct baalut_cd) as num_unique_baalut,
sum (case when baalut_cd = 'P' then 1 end) as count_P,
sum (case when baalut_cd = 'C' then 1 end) as count_C,
sum (case when baalut_cd = 'D' then 1 end) as count_D,
sum (case when baalut_cd = 'L' then 1 end) as count_L ,
sum (case when baalut_cd = 'R' then 1 end)as count_R ,
count(*) as count_num_baalut
from
FP.DIM_BAALUT_HIST
group by mispar_Rechev;
quit;
/* filter and count cars with only 3+ baalut type*/
proc sql;
create table fp.count_3_plus_baalut as
select count(*) as num_cars_3_plus_baalut
from
(select*
from multi_baalut
where num_unique_baalut >= 3) a
;
quit;
proc print data=fp.count_3_plus_baalut;
run;
/* first method for finding outliers : computing precentiles with proc means */
/* create a stats table to find limit for extreme values -> 99th precentile */
proc means data= multi_baalut n mean stddev p75 p95 p99 nmiss max STACKODS ;
var _numeric_;
/*output out= means_baalut p99= /autoname;*/
ods select all;
ods output fp.means_baalut= _all_;
run;
/* all the cars that have a higher stat than the 99th precentile's value
in any of the stats, should be considered as ouliers or "suspicous" */
proc sql;
create table fp.ouliers_baaulut1 as
select *
from multi_baalut
where count_p >3 or
count_C >2 or
count_D >4 or
count_L >1 or
count_R >1 or
num_unique_baalut>3 or
count_num_baalut > 5;
quit;
/* second method for finding outliers : robust regression with MCD algorithm */
/* robust regression finds the center of the data and calculstes
distances for each observation with the multi covariance detrerminant .
extreme outliers will have the largest distance */
/* first step: fill missing valuees with 0 */
proc sql;
create table multi_baalut2 as
select mispar_Rechev,
count (distinct baalut_cd) as num_unique_baalut,
sum (case when baalut_cd = 'P' then 1 else 0 end) as count_P,
sum (case when baalut_cd = 'C' then 1 else 0 end) as count_C,
sum (case when baalut_cd = 'D' then 1 else 0 end) as count_D,
sum (case when baalut_cd = 'L' then 1 else 0 end) as count_L ,
sum (case when baalut_cd = 'R' then 1 else 0 end)as count_R ,
count(*) as count_num_baalut
from
FP.DIM_BAALUT_HIST
group by mispar_Rechev;
quit;
/* step 2 : adding y column . y is a normal distribution random value so should not effect computation */
data multi_baalut2;
set multi_baalut2;
y= rand('normal', -1, 1);
retain indexer 0;
indexer = indexer +1;
run;
/* step 3 : using proc robustreg with mcd output */
/* leverage = X space oulier */
/*/ods exclude all;*/
proc robustreg data= multi_baalut2 method = M seed=12345;
model y=count_p count_c count_d count_l count_r
count_num_baalut num_unique_baalut/ diagnostics leverage(MCDInfo);
ods select MCDenter MCDcov Diagnostics;
ods output diagnostics= Diagnostics(where=(leverage=1));
run;
/*performance threads;*/
proc sort data= diagnostics out= diagnostics;
by descending PRobustDist;
run;
/* step 4: combine results for outliers with initial table */
proc sql ;
create table fp.multi_baalut_mdc (drop= y indexer) as
select m.* , d.PRobustDist
from multi_baalut2 m
left join diagnostics d
on m.indexer-2= d.obs
order by d.PRobustDist desc;
quit;
%%SAS
proc sql;
select * from fp.import;
quit;
mispar_rechev | num_unique | count_P | count_C | count_D | count_L | count_R | count_num | Projected Robust MCD Distance |
---|---|---|---|---|---|---|---|---|
55085701 | 3 | 6 | 2 | 7 | 0 | 0 | 15 | 348.3 |
59640401 | 2 | 8 | 0 | 6 | 0 | 0 | 14 | 331 |
36626101 | 2 | 9 | 0 | 3 | 0 | 0 | 12 | 324.2 |
74011401 | 2 | 6 | 0 | 9 | 0 | 0 | 15 | 320.7 |
38965301 | 2 | 8 | 0 | 5 | 0 | 0 | 13 | 314 |
39654401 | 2 | 8 | 0 | 5 | 0 | 0 | 13 | 314 |
30602101 | 2 | 7 | 0 | 7 | 0 | 0 | 14 | 312.8 |
49552501 | 3 | 8 | 1 | 4 | 0 | 0 | 13 | 306.6 |
86113301 | 4 | 2 | 2 | 10 | 1 | 0 | 15 | 303.2 |
68247101 | 4 | 5 | 1 | 9 | 1 | 0 | 16 | 295 |
17793201 | 3 | 8 | 1 | 3 | 0 | 0 | 12 | 293.5 |
36846101 | 2 | 5 | 0 | 9 | 0 | 0 | 14 | 290.8 |
33462601 | 3 | 4 | 2 | 7 | 0 | 0 | 13 | 290.7 |
55136601 | 2 | 4 | 0 | 10 | 0 | 0 | 14 | 288.1 |
45174101 | 3 | 2 | 3 | 1 | 0 | 0 | 6 | 283.7 |
20650501 | 3 | 2 | 3 | 1 | 0 | 0 | 6 | 283.7 |
19397601 | 4 | 2 | 3 | 1 | 1 | 0 | 7 | 283.7 |
38766501 | 2 | 8 | 0 | 3 | 0 | 0 | 11 | 283.7 |
37124601 | 1 | 0 | 3 | 0 | 0 | 0 | 3 | 283.4 |
56085901 | 1 | 0 | 3 | 0 | 0 | 0 | 3 | 283.4 |
58488901 | 1 | 0 | 3 | 0 | 0 | 0 | 3 | 283.4 |
21402001 | 1 | 0 | 3 | 0 | 0 | 0 | 3 | 283.4 |
64305901 | 1 | 0 | 3 | 0 | 0 | 0 | 3 | 283.4 |
68297201 | 1 | 0 | 3 | 0 | 0 | 0 | 3 | 283.4 |
63029401 | 1 | 0 | 3 | 0 | 0 | 0 | 3 | 283.4 |
74546101 | 1 | 0 | 3 | 0 | 0 | 0 | 3 | 283.4 |
80893401 | 1 | 0 | 3 | 0 | 0 | 0 | 3 | 283.4 |
86126001 | 1 | 0 | 3 | 0 | 0 | 0 | 3 | 283.4 |
19402201 | 2 | 0 | 3 | 0 | 1 | 0 | 4 | 283.4 |
50168901 | 2 | 0 | 3 | 0 | 1 | 0 | 4 | 283.4 |
34380901 | 2 | 1 | 3 | 0 | 0 | 0 | 4 | 283.3 |
23862002 | 2 | 1 | 3 | 0 | 0 | 0 | 4 | 283.3 |
19472301 | 2 | 0 | 3 | 3 | 0 | 0 | 6 | 279.2 |
20649001 | 2 | 0 | 3 | 3 | 0 | 0 | 6 | 279.2 |
58890401 | 2 | 0 | 3 | 3 | 0 | 0 | 6 | 279.2 |
37474201 | 2 | 7 | 0 | 5 | 0 | 0 | 12 | 275.8 |
38994801 | 2 | 7 | 0 | 5 | 0 | 0 | 12 | 275.8 |
19634301 | 3 | 7 | 0 | 5 | 1 | 0 | 13 | 275.8 |
21402201 | 2 | 0 | 3 | 2 | 0 | 0 | 5 | 275.4 |
36585501 | 2 | 0 | 3 | 2 | 0 | 0 | 5 | 275.4 |
76505001 | 3 | 0 | 3 | 2 | 1 | 0 | 6 | 275.4 |
77828501 | 2 | 0 | 3 | 1 | 0 | 0 | 4 | 273.9 |
63875001 | 2 | 0 | 3 | 1 | 0 | 0 | 4 | 273.9 |
68237101 | 2 | 0 | 3 | 1 | 0 | 0 | 4 | 273.9 |
24528601 | 2 | 0 | 3 | 1 | 0 | 0 | 4 | 273.9 |
20520101 | 2 | 0 | 3 | 1 | 0 | 0 | 4 | 273.9 |
20766201 | 3 | 0 | 3 | 1 | 1 | 0 | 5 | 273.9 |
61778501 | 3 | 0 | 3 | 1 | 1 | 0 | 5 | 273.9 |
57090101 | 3 | 0 | 3 | 1 | 1 | 0 | 5 | 273.9 |
43739901 | 3 | 1 | 3 | 1 | 0 | 0 | 5 | 273.9 |
30212501 | 2 | 8 | 0 | 2 | 0 | 0 | 10 | 270.7 |
85231901 | 2 | 8 | 0 | 2 | 0 | 0 | 10 | 270.7 |
74177701 | 2 | 5 | 0 | 8 | 0 | 0 | 13 | 268.2 |
17766101 | 2 | 5 | 0 | 8 | 0 | 0 | 13 | 268.2 |
17079201 | 2 | 7 | 1 | 0 | 0 | 0 | 8 | 264.7 |
30698101 | 3 | 5 | 2 | 3 | 0 | 0 | 10 | 264.5 |
21139901 | 4 | 5 | 2 | 3 | 1 | 0 | 11 | 264.5 |
22208101 | 2 | 4 | 0 | 9 | 0 | 0 | 13 | 264.1 |
30447101 | 2 | 4 | 0 | 9 | 0 | 0 | 13 | 264.1 |
29940601 | 2 | 8 | 0 | 1 | 0 | 0 | 9 | 259.5 |
50004001 | 2 | 7 | 0 | 4 | 0 | 0 | 11 | 258.9 |
63227301 | 2 | 6 | 0 | 6 | 0 | 0 | 12 | 258.1 |
21297801 | 4 | 7 | 1 | 3 | 1 | 0 | 12 | 255.1 |
24086601 | 3 | 5 | 1 | 7 | 0 | 0 | 13 | 253.4 |
49593301 | 3 | 3 | 2 | 6 | 0 | 0 | 11 | 251 |
12401401 | 1 | 7 | 0 | 0 | 0 | 0 | 7 | 250.3 |
66451701 | 3 | 4 | 2 | 4 | 0 | 0 | 10 | 247.5 |
22215801 | 2 | 5 | 0 | 7 | 0 | 0 | 12 | 246 |
30515101 | 2 | 5 | 0 | 7 | 0 | 0 | 12 | 246 |
49230901 | 2 | 7 | 0 | 3 | 0 | 0 | 10 | 243.5 |
59428201 | 2 | 7 | 0 | 3 | 0 | 0 | 10 | 243.5 |
33891701 | 2 | 7 | 0 | 3 | 0 | 0 | 10 | 243.5 |
74137301 | 2 | 7 | 0 | 3 | 0 | 0 | 10 | 243.5 |
50089201 | 3 | 7 | 0 | 3 | 1 | 0 | 11 | 243.5 |
22105301 | 3 | 7 | 0 | 3 | 1 | 0 | 11 | 243.5 |
33367902 | 2 | 4 | 0 | 8 | 0 | 0 | 12 | 240.4 |
39107501 | 2 | 4 | 0 | 8 | 0 | 0 | 12 | 240.4 |
16173401 | 3 | 4 | 0 | 8 | 1 | 0 | 13 | 240.2 |
11680001 | 2 | 6 | 0 | 5 | 0 | 0 | 11 | 238.8 |
39143701 | 2 | 6 | 0 | 5 | 0 | 0 | 11 | 238.8 |
47200601 | 2 | 6 | 0 | 5 | 0 | 0 | 11 | 238.8 |
38671301 | 3 | 6 | 0 | 5 | 1 | 0 | 12 | 238.7 |
63401701 | 3 | 4 | 2 | 3 | 0 | 0 | 9 | 236.6 |
17786501 | 3 | 3 | 2 | 5 | 0 | 0 | 10 | 236.5 |
20142901 | 4 | 7 | 1 | 1 | 0 | 1 | 10 | 233.3 |
81127001 | 4 | 2 | 1 | 9 | 1 | 0 | 13 | 231.2 |
38770001 | 2 | 7 | 0 | 2 | 0 | 0 | 9 | 229.7 |
45300701 | 2 | 7 | 0 | 2 | 0 | 0 | 9 | 229.7 |
59953201 | 2 | 7 | 0 | 2 | 0 | 0 | 9 | 229.7 |
46570501 | 3 | 2 | 0 | 9 | 1 | 0 | 12 | 224.7 |
59444001 | 2 | 5 | 0 | 6 | 0 | 0 | 11 | 224.4 |
59723201 | 2 | 5 | 0 | 6 | 0 | 0 | 11 | 224.4 |
55571201 | 2 | 5 | 0 | 6 | 0 | 0 | 11 | 224.4 |
55556801 | 2 | 5 | 0 | 6 | 0 | 0 | 11 | 224.4 |
20372301 | 2 | 5 | 0 | 6 | 0 | 0 | 11 | 224.4 |
12475101 | 2 | 5 | 0 | 6 | 0 | 0 | 11 | 224.4 |
74668101 | 2 | 5 | 0 | 6 | 0 | 0 | 11 | 224.4 |
29993701 | 3 | 5 | 0 | 6 | 1 | 0 | 12 | 224.3 |
50056601 | 3 | 5 | 0 | 6 | 1 | 0 | 12 | 224.3 |
37453201 | 3 | 5 | 0 | 6 | 1 | 0 | 12 | 224.3 |