H-1B Employer Data Hub¶

The H-1B program allows employers in the United States to temporarily employ foreign workers in occupations that require the theoretical and practical application of a body of highly specialized knowledge and a bachelor's degree or higher in the specific specialty, or its equivalent.

The H-1B Employer Data Hub includes data from fiscal year 2009 through fiscal year 2023 (quarter 4) on employers who have submitted petitions to employ H-1B nonimmigrant workers. Data can be queried by fiscal year, employer name, city, state, zip code, and NAICS code. The H-1B Employer Data Hub has data on the first decisions USCIS makes on petitions for initial and continuing employment. It identifies employers by the last four digits of their tax identification number. You can download annual and query-specific data in Excel or .csv format.

This data can be downloaded from https://www.uscis.gov/tools/reports-and-studies/h-1b-employer-data-hub for each year.
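The downloaded exports follow a consistent naming pattern, `h1b_datahubexport-<year>.csv`. As a small sketch, the expected per-year filenames can be generated locally; note that only the filename pattern is taken from the files below, while the exact download URL path on the USCIS site may differ:

```python
# Build the expected per-year export filenames. The naming pattern is taken
# from the downloaded files; the download URL itself is not assumed here.
def export_filename(year: int) -> str:
    """Return the Data Hub export filename for a given fiscal year."""
    return f"h1b_datahubexport-{year}.csv"

filenames = [export_filename(y) for y in range(2009, 2024)]
print(filenames[0])    # h1b_datahubexport-2009.csv
print(len(filenames))  # 15 fiscal years (2009 through 2023)
```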

In [1]:
import pandas as pd
import numpy as np
import os, glob, csv, time
import plotly.express as px
import plotly.graph_objects as go
# import matplotlib.pyplot as plt
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',200)
pd.options.plotting.backend = "plotly"

Import the H-1B CSV files into a pandas DataFrame¶

Using the glob module, let's list all the .csv files in the folder.

In [2]:
h1b_files_lst = glob.glob('*.csv')
h1b_files_lst
Out[2]:
['h1b_datahubexport-2015.csv',
 'h1b_datahubexport-2014.csv',
 'h1b_datahubexport-2016.csv',
 'h1b_datahubexport-2017.csv',
 'h1b_datahubexport-2013.csv',
 'h1b_datahubexport-2012.csv',
 'h1b_datahubexport-2010.csv',
 'h1b_datahubexport-2011.csv',
 'h1b_datahubexport-2020.csv',
 'h1b_datahubexport-2021.csv',
 'h1b_datahubexport-2009.csv',
 'h1b_datahubexport-2023.csv',
 'h1b_datahubexport-2022.csv',
 'h1b_datahubexport-2019.csv',
 'h1b_datahubexport-2018.csv']
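Note that glob.glob returns files in arbitrary filesystem order, which is why the list above is unsorted. When a chronological order is wanted, sorting the list suffices, since the fiscal year is the only part of the filename that varies:

```python
# glob.glob makes no ordering guarantee; sorted() yields chronological order
# because the fiscal year is the only varying part of the filename.
files = ['h1b_datahubexport-2015.csv',
         'h1b_datahubexport-2023.csv',
         'h1b_datahubexport-2009.csv']
print(sorted(files)[0])   # h1b_datahubexport-2009.csv
print(sorted(files)[-1])  # h1b_datahubexport-2023.csv
```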

Choose a year for analysis¶

We start with 2023 and name the DataFrame fnl_df, short for "final DataFrame".

In [3]:
year = '2023'
fnl_df = pd.DataFrame()
for fl in h1b_files_lst:
    if year in fl:
        df = pd.read_csv(fl)
        
        # Split off rows with a missing Employer, then normalize names:
        # prefix names that start with a digit with 'z ' (so they sort last)
        # and strip apostrophes
        df2 = df[df.Employer.isna()].reset_index(drop=True)
        df1 = df[~df.Employer.isna()].reset_index(drop=True)
        df1['Employer'] = df1['Employer'].apply(lambda x: 'z ' + x if x[0].isdigit() else x)
        df1['Employer'] = df1['Employer'].apply(lambda x: x.replace("'", ""))
        df = pd.concat([df2, df1])
        
        # Replace NaN with None across the whole dataset
        df = df.where(pd.notnull(df), None)
        df = df.replace(np.nan, None)
        fnl_df = pd.concat([fnl_df,df])
fnl_df = fnl_df.reset_index(drop=True)
fnl_df
Out[3]:
Fiscal Year Employer Initial Approval Initial Denial Continuing Approval Continuing Denial NAICS Tax ID State City ZIP
0 2023 None 1 0 0 0 51 8070.0 DE WILMINGTON 19801.0
1 2023 z 0965688 BC LTD DBA PROCOGIA 0 0 1 0 51 209.0 WA SEATTLE 98101.0
2 2023 z 1 800 CONTACTS INC 0 0 1 0 42 1643.0 WA SEATTLE 98101.0
3 2023 z 1 800 CONTACTS INC 0 0 1 0 42 1643.0 UT DRAPER 84020.0
4 2023 z 1 800 FLOWERS COM INC 0 0 2 0 45 7311.0 NY JERICHO 11753.0
... ... ... ... ... ... ... ... ... ... ... ...
33327 2023 ZYMEBALANZ LLC 2 0 0 0 51 927.0 CA SAN CARLOS 94070.0
33328 2023 ZYMO SOLUTIONS LLC 2 0 1 0 54 3743.0 VA HERNDON 20170.0
33329 2023 ZYNGA INC 1 0 2 0 51 3483.0 CA SAN FRANCISCO 94103.0
33330 2023 ZYNGA INC 0 0 6 0 51 3483.0 CA SAN MATEO 94403.0
33331 2023 ZYNO MEDICAL LLC 0 0 1 1 33 7390.0 MA NATICK 1760.0

33332 rows × 11 columns

High level analysis from above data¶

There are 33,332 records for 2023, each broken out by employer and location (state/city), so the same employer can appear on multiple rows (e.g. ZYNGA INC above). The row count is therefore an upper bound on the number of employers that filed new or continuing H-1B petitions for employees.
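Since rows are employer-location combinations, the number of distinct employers can be checked with nunique(). A small sketch on toy data mirroring the ZYNGA INC rows above:

```python
import pandas as pd

# Each row is an employer-location combination, so the row count
# overstates the number of distinct employers.
toy = pd.DataFrame({
    'Employer': ['ZYNGA INC', 'ZYNGA INC', 'ZYNO MEDICAL LLC'],
    'City': ['SAN FRANCISCO', 'SAN MATEO', 'NATICK'],
})
print(len(toy))                   # 3 rows
print(toy['Employer'].nunique())  # 2 distinct employers
```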

Let's look at the data field descriptions for the data we downloaded.

(Image: data field descriptions from the USCIS H-1B Employer Data Hub.)

Using the above data, we can create some new KPIs that highlight interesting H-1B trends.

  1. Initial Total: The total number of first-time applications filed by the firm.

  2. Continuing Total: The total number of applications other than new employment filed by the firm.

  3. Total Applied: The combined total of initial and continuing applications filed by the firm.

  4. Total Approval: The combined total of initial and continuing applications approved for the firm.

  5. Total Denial: The combined total of initial and continuing applications denied for the firm.

  6. Initial Approval Rate: The ratio of initial applications approved to total initial applications.

  7. Initial Denial Rate: The ratio of initial applications denied to total initial applications.

  8. Continuing Approval Rate: The ratio of continuing applications approved to total continuing applications.

  9. Continuing Denial Rate: The ratio of continuing applications denied to total continuing applications.

  10. Approval Rate: The ratio of total applications approved to total applications filed by the firm.

  11. Denial Rate: The ratio of total applications denied to total applications filed by the firm.
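One caveat: the rate KPIs divide by totals that can be zero (a firm may have no initial or no continuing applications). pandas returns NaN for 0/0 and inf for n/0 rather than raising, which is why the cell below nulls those values out afterwards. A minimal sketch with hypothetical counts:

```python
import numpy as np
import pandas as pd

# Hypothetical counts: the second firm filed no initial applications.
toy = pd.DataFrame({'initial_approval': [3, 0], 'initial_denial': [1, 0]})
toy['initial_total'] = toy['initial_approval'] + toy['initial_denial']

# 0/0 yields NaN (and n/0 would yield inf); normalize both to NaN
toy['initial_approval_rate'] = toy['initial_approval'] / toy['initial_total']
toy['initial_approval_rate'] = toy['initial_approval_rate'].replace([np.inf, -np.inf], np.nan)
print(toy['initial_approval_rate'].tolist())  # [0.75, nan]
```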

In [4]:
# Normalize column names: lowercase, words joined with underscores
fnl_df.columns = ['_'.join(c.lower().split()) for c in fnl_df.columns]
fnl_df['initial_total'] = fnl_df['initial_approval']+fnl_df['initial_denial']
fnl_df['continuing_total'] = fnl_df['continuing_approval']+fnl_df['continuing_denial']
fnl_df['total_applied'] = fnl_df['initial_total'] + fnl_df['continuing_total']
fnl_df['total_approval'] = fnl_df['initial_approval'] + fnl_df['continuing_approval']
fnl_df['total_denial'] = fnl_df['initial_denial'] + fnl_df['continuing_denial']
fnl_df['initial_approval_rate'] = fnl_df['initial_approval']/fnl_df['initial_total']
fnl_df['initial_denial_rate'] = fnl_df['initial_denial']/fnl_df['initial_total']
fnl_df['continuing_approval_rate'] = fnl_df['continuing_approval']/fnl_df['continuing_total']
fnl_df['continuing_denial_rate'] = fnl_df['continuing_denial']/fnl_df['continuing_total']
fnl_df['approval_rate'] = fnl_df['total_approval']/fnl_df['total_applied']
fnl_df['denial_rate'] = fnl_df['total_denial']/fnl_df['total_applied']
fnl_df = fnl_df.replace(np.nan, None)
fnl_df = fnl_df.replace(np.inf, None)
fnl_df = fnl_df.where(pd.notnull(fnl_df), None)
fnl_df
Out[4]:
fiscal_year employer initial_approval initial_denial continuing_approval continuing_denial naics tax_id state city zip initial_total continuing_total total_applied total_approval total_denial initial_approval_rate initial_denial_rate continuing_approval_rate continuing_denial_rate approval_rate denial_rate
0 2023 None 1 0 0 0 51 8070.0 DE WILMINGTON 19801.0 1 0 1 1 0 1.0 0.0 NaN NaN 1.0 0.0
1 2023 z 0965688 BC LTD DBA PROCOGIA 0 0 1 0 51 209.0 WA SEATTLE 98101.0 0 1 1 1 0 1.0 0.0 1.0 0.0 1.0 0.0
2 2023 z 1 800 CONTACTS INC 0 0 1 0 42 1643.0 WA SEATTLE 98101.0 0 1 1 1 0 1.0 0.0 1.0 0.0 1.0 0.0
3 2023 z 1 800 CONTACTS INC 0 0 1 0 42 1643.0 UT DRAPER 84020.0 0 1 1 1 0 1.0 0.0 1.0 0.0 1.0 0.0
4 2023 z 1 800 FLOWERS COM INC 0 0 2 0 45 7311.0 NY JERICHO 11753.0 0 2 2 2 0 1.0 0.0 1.0 0.0 1.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
33327 2023 ZYMEBALANZ LLC 2 0 0 0 51 927.0 CA SAN CARLOS 94070.0 2 0 2 2 0 1.0 0.0 1.0 0.0 1.0 0.0
33328 2023 ZYMO SOLUTIONS LLC 2 0 1 0 54 3743.0 VA HERNDON 20170.0 2 1 3 3 0 1.0 0.0 1.0 0.0 1.0 0.0
33329 2023 ZYNGA INC 1 0 2 0 51 3483.0 CA SAN FRANCISCO 94103.0 1 2 3 3 0 1.0 0.0 1.0 0.0 1.0 0.0
33330 2023 ZYNGA INC 0 0 6 0 51 3483.0 CA SAN MATEO 94403.0 0 6 6 6 0 1.0 0.0 1.0 0.0 1.0 0.0
33331 2023 ZYNO MEDICAL LLC 0 0 1 1 33 7390.0 MA NATICK 1760.0 0 2 2 1 1 1.0 0.0 0.5 0.5 0.5 0.5

33332 rows × 22 columns

Top 20 firms with the highest total approvals¶

In [5]:
fnl_df.sort_values('total_approval', ascending=False).head(20).reset_index(drop=True)
Out[5]:
fiscal_year employer initial_approval initial_denial continuing_approval continuing_denial naics tax_id state city zip initial_total continuing_total total_applied total_approval total_denial initial_approval_rate initial_denial_rate continuing_approval_rate continuing_denial_rate approval_rate denial_rate
0 2023 AMAZON.COM SERVICES LLC 248 14 4120 110 45 4687.0 VA ARLINGTON 22202.0 262 4230 4492 4368 124 0.946565 0.053435 0.973995 0.026005 0.972395 0.027605
1 2023 TATA CONSULTANCY SVCS LTD 676 39 2989 149 54 9806.0 MD ROCKVILLE 20850.0 715 3138 3853 3665 188 0.945455 0.054545 0.952518 0.047482 0.951207 0.048793
2 2023 COGNIZANT TECHNOLOGY SOLUTIONS US CORP 944 15 2450 114 54 4155.0 TX COLLEGE STATION 77845.0 959 2564 3523 3394 129 0.984359 0.015641 0.955538 0.044462 0.963383 0.036617
3 2023 INFOSYS LIMITED 728 37 2553 45 54 235.0 TX RICHARDSON 75082.0 765 2598 3363 3281 82 0.951634 0.048366 0.982679 0.017321 0.975617 0.024383
4 2023 GOOGLE LLC 56 3 2271 29 54 3581.0 CA MOUNTAIN VIEW 94043.0 59 2300 2359 2327 32 0.949153 0.050847 0.987391 0.012609 0.986435 0.013565
5 2023 MICROSOFT CORPORATION 39 0 1950 45 51 4442.0 WA REDMOND 98052.0 39 1995 2034 1989 45 1.000000 0.000000 0.977444 0.022556 0.977876 0.022124
6 2023 APPLE INC 47 3 1725 22 33 4110.0 CA CUPERTINO 95014.0 50 1747 1797 1772 25 0.940000 0.060000 0.987407 0.012593 0.986088 0.013912
7 2023 JPMORGAN CHASE CO 58 1 1429 22 55 4428.0 IL CHICAGO 60603.0 59 1451 1510 1487 23 0.983051 0.016949 0.984838 0.015162 0.984768 0.015232
8 2023 META PLATFORMS INC 36 2 1435 11 51 5019.0 CA MENLO PARK 94025.0 38 1446 1484 1471 13 0.947368 0.052632 0.992393 0.007607 0.991240 0.008760
9 2023 DELOITTE CONSULTING LLP 63 3 1363 33 54 4513.0 PA PHILADELPHIA 19103.0 66 1396 1462 1426 36 0.954545 0.045455 0.976361 0.023639 0.975376 0.024624
10 2023 HCL AMERICA INC 313 25 1063 17 54 5035.0 CA SUNNYVALE 94085.0 338 1080 1418 1376 42 0.926036 0.073964 0.984259 0.015741 0.970381 0.029619
11 2023 CAPGEMINI AMERICA INC 568 12 785 11 54 5929.0 IL CHICAGO 60606.0 580 796 1376 1353 23 0.979310 0.020690 0.986181 0.013819 0.983285 0.016715
12 2023 WAL MART ASSOCIATES INC 49 2 1225 22 45 4409.0 AR BENTONVILLE 72716.0 51 1247 1298 1274 24 0.960784 0.039216 0.982358 0.017642 0.981510 0.018490
13 2023 ERNST YOUNG US LLP 35 2 1135 30 54 5596.0 NJ SECAUCUS 7094.0 37 1165 1202 1170 32 0.945946 0.054054 0.974249 0.025751 0.973378 0.026622
14 2023 INTEL CORPORATION 52 5 1023 64 33 2743.0 AZ CHANDLER 85248.0 57 1087 1144 1075 69 0.912281 0.087719 0.941122 0.058878 0.939685 0.060315
15 2023 IBM CORPORATION 317 9 730 14 54 1985.0 NC DURHAM 27709.0 326 744 1070 1047 23 0.972393 0.027607 0.981183 0.018817 0.978505 0.021495
16 2023 ORACLE AMERICA INC 6 0 908 4 51 5249.0 TX AUSTIN 78741.0 6 912 918 914 4 1.000000 0.000000 0.995614 0.004386 0.995643 0.004357
17 2023 WIPRO LIMITED 177 25 694 26 54 4401.0 NJ EAST BRUNSWICK 8816.0 202 720 922 871 51 0.876238 0.123762 0.963889 0.036111 0.944685 0.055315
18 2023 CISCO SYSTEMS INC 24 0 790 19 33 9951.0 CA SAN JOSE 95134.0 24 809 833 814 19 1.000000 0.000000 0.976514 0.023486 0.977191 0.022809
19 2023 LTIMINDTREE LIMITED 225 1 585 7 54 4303.0 NJ EDISON 8817.0 226 592 818 810 8 0.995575 0.004425 0.988176 0.011824 0.990220 0.009780
In [6]:
# plotly.graph_objects
top_20 = fnl_df[fnl_df.total_applied > 0].sort_values('total_approval', ascending=False).head(20)
colors = px.colors.qualitative.Plotly
fig = go.Figure()
fig.add_trace(go.Scatter(x=top_20['employer'], y=top_20['total_approval'], mode='lines',
                         name='total_approval', line=dict(color=colors[0])))
fig.add_trace(go.Scatter(x=top_20['employer'], y=top_20['total_denial'], mode='lines',
                         name='total_denial', line=dict(color=colors[1])))
fig.show()

Approvals among firms that filed 10 or fewer petitions, by state¶

In [7]:
num_petitions = 10
approval_counts_by_state = (
    fnl_df[fnl_df.total_applied <= num_petitions]
    .groupby(['fiscal_year', 'state'])['total_approval'].sum()
    .reset_index()
    .sort_values(['fiscal_year', 'total_approval'], ascending=[False, False])
    .reset_index(drop=True)
)
approval_counts_by_state = approval_counts_by_state.pivot(index='state', columns='fiscal_year', values='total_approval').fillna(0)
fig = approval_counts_by_state.plot(title=f"H1B Approvals Among Companies that Filed {num_petitions} or Fewer Petitions in {year}",
              template="simple_white",
              labels=dict(index="state", value="total_approval", variable="Fiscal Year"))
fig.show()

The data above, sorted by total approvals per state in descending order¶

In [8]:
(fnl_df[fnl_df.total_applied <= num_petitions]
 .groupby(['fiscal_year', 'state'])['total_approval'].sum()
 .reset_index()
 .sort_values(['fiscal_year', 'total_approval'], ascending=[False, False])
 .reset_index(drop=True))
Out[8]:
fiscal_year state total_approval
0 2023 CA 9994
1 2023 TX 7118
2 2023 NY 5389
3 2023 NJ 4970
4 2023 IL 2864
5 2023 MA 2516
6 2023 VA 2394
7 2023 MI 2378
8 2023 GA 2210
9 2023 FL 1940
10 2023 PA 1793
11 2023 NC 1332
12 2023 OH 1284
13 2023 MD 1181
14 2023 WA 995
15 2023 MO 771
16 2023 MN 762
17 2023 AZ 707
18 2023 CT 673
19 2023 TN 662
20 2023 CO 626
21 2023 WI 480
22 2023 IN 474
23 2023 DC 422
24 2023 SC 409
25 2023 UT 377
26 2023 IA 343
27 2023 KS 319
28 2023 OR 312
29 2023 NE 293
30 2023 DE 291
31 2023 NV 244
32 2023 LA 238
33 2023 NH 229
34 2023 AR 218
35 2023 KY 203
36 2023 AL 200
37 2023 OK 182
38 2023 GU 159
39 2023 NM 131
40 2023 RI 127
41 2023 SD 107
42 2023 MS 93
43 2023 ID 88
44 2023 ME 80
45 2023 ND 73
46 2023 MT 57
47 2023 WV 56
48 2023 HI 46
49 2023 WY 42
50 2023 VT 35
51 2023 AK 24
52 2023 MP 22
53 2023 PR 19
54 2023 VI 9
55 2023 XX 5
56 2023 AE 0

Denials among firms that filed 10 or fewer petitions, by state¶

In [9]:
num_petitions = 10
denial_counts_by_state = (
    fnl_df[fnl_df.total_applied <= num_petitions]
    .groupby(['fiscal_year', 'state'])['total_denial'].sum()
    .reset_index()
    .sort_values(['fiscal_year', 'total_denial'], ascending=[False, False])
    .reset_index(drop=True)
)
denial_counts_by_state = denial_counts_by_state.pivot(index='state', columns='fiscal_year', values='total_denial').fillna(0)
fig = denial_counts_by_state.plot(title=f"H1B Denials Among Companies that Filed {num_petitions} or Fewer Petitions in {year}",
              template="simple_white",
              labels=dict(index="state", value="total_denial", variable="Fiscal Year"))
fig.show()