Using numba for GPU acceleration with large Excel file
up vote
0
down vote
favorite
I've minimal experience of using GPUs and after looking online, I came up with this:
import pandas as pd
from haversine import haversine
import numpy as np
from pandas import ExcelWriter
import numba as nb
# Print NumPy floats with up to 20 digits of precision.
np.set_printoptions(precision=20)
# Load the input workbook into a DataFrame.
path = 'distance.xlsx'
df = pd.read_excel(path)
# Add a zero-filled "Dist" column; f() below writes the smallest distance found for each row into it.
# NOTE(review): 27055 is hard-coded -- presumably the sheet's row count; it must
# match the number of rows in df. Confirm, or derive it from len(df).
df = df.assign(Dist=pd.Series(np.zeros(27055)).values);
#df = df.assign(Facility=pd.Series(np.zeros(27055)).values);
# "Facility" is zero-initialised with the structured dtype 'float,float' (a pair of floats per row).
df = df.assign(Facility=pd.Series(np.zeros((27055,),dtype='float,float')).values);
# "Facility_city" starts as an empty string for every row.
df["Facility_city"] = ""
#idx = np.asarray(df.loc[df["lat1"] != '.'].ix[:,0].index)
#temp1 = 1e10
#j = 0
# Index labels of the rows whose lat1 is not '.' (presumably a missing-value
# marker) and whose state equals the state of the first row.
idx = np.asarray(df[(df['lat1']!='.') & (df['state']== df['state'][0])].index)
# f: for each row i, visit the candidate rows listed in idx, compute the
# haversine distance (miles=True) between the candidate's (lat1, long1) and
# row i's (lat2, long2), and keep the smallest distance in Dist together with
# the candidate's coordinates (Facility) and city (Facility_city). Returns df.
# NOTE(review): the indentation of this listing has been lost, so the exact
# nesting of the statements inside f cannot be read off the text.
# NOTE(review): the traceback quoted after this script shows a numba
# TypingError raised at the f(df) call. f receives a pandas DataFrame and
# calls the third-party haversine() function -- presumably that is what
# nopython mode rejects; confirm against the numba documentation.
@nb.jit(nopython=True)
def f(df):
# temp1 is the running minimum distance (1e10 stands for "nothing found yet"); j is the cursor into idx.
temp1 = 1e10
j = 0
for i in range(0, len(df)):
# Compare row i's state with the next row's state.
# NOTE(review): i runs up to len(df) - 1, so i+1 refers past the last row on the final iteration -- confirm.
if df['state'][i+1] != df['state'][i]:
# Rebuild the candidate list for the next row's state.
# NOTE(review): assigning idx here makes it a local name of f, so the
# module-level idx defined above is not the one read below -- confirm intent.
idx = np.asarray(df[(df['lat1']!='.') & (df['state']== df['state'][i+1])].index)
#while (df.iloc[idx[j]]['state'] == df.iloc[i]['state']):
# Visit every candidate in idx.
while (j!=len(idx)):
p1 = (df.iloc[idx[j]]['lat1'],df.iloc[idx[j]]['long1'])
p2 = (df.iloc[i]['lat2'],df.iloc[i]['long2'])
# Store the smaller of the best-so-far distance and this candidate's distance.
df.Dist.iloc[i] = min(temp1,haversine(p1, p2, miles=True))
# True only when this candidate is strictly closer than the previous best.
if df.Dist.iloc[i] < temp1:
#df.Facility.iloc[i] = idx[j]
df.Facility.iloc[i] = (p1[0],p1[1])
df.Facility_city.iloc[i] = df.city.iloc[idx[j]]
temp1 = df.Dist.iloc[i]
j+=1
# Reset the candidate cursor and the running minimum (presumably once per row i).
j = 0
temp1 = 1e10
return df
# Script entry point: run f over the DataFrame and write the result to sheet 'Sheet1' of Results.xlsx.
if __name__ == "__main__":
df = f(df)
writer = ExcelWriter('Results.xlsx')
df.to_excel(writer,'Sheet1')
# NOTE(review): ExcelWriter.save() was removed in pandas 2.0; newer versions
# need writer.close() or a with-block -- confirm against the installed pandas.
writer.save()
The for loop runs close to 30k times, hence my inclination to use a GPU (via FloydHub). I'm struggling to configure my code to run on one.
When I execute this, I get an error.
Traceback (most recent call last):
File "<ipython-input-8-1a4900ad9325>", line 1, in <module>
runfile('/Users/deepayanbhadra/Downloads/Chotka /Chotka.py', wdir='/Users/deepayanbhadra/Downloads/Chotka ')
File "/anaconda3/lib/python3.6/site-packages/spyder/utils/site/sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "/anaconda3/lib/python3.6/site-packages/spyder/utils/site/sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "/Users/deepayanbhadra/Downloads/Chotka /Chotka.py", line 60, in <module>
df = f(df)
File "/anaconda3/lib/python3.6/site-packages/numba/dispatcher.py", line 344, in _compile_for_args
reraise(type(e), e, None)
File "/anaconda3/lib/python3.6/site-packages/numba/six.py", line 658, in reraise
raise value.with_traceback(tb)
TypingError: cannot determine Numba type of <class 'function'>
How do I go about circumventing this? I think numba is the best framework from the references but any other (like PyCUDA) is also okay.
python for-loop gpu numba
add a comment |
up vote
0
down vote
favorite
I've minimal experience of using GPUs and after looking online, I came up with this:
import pandas as pd
from haversine import haversine
import numpy as np
from pandas import ExcelWriter
import numba as nb
# Print NumPy floats with up to 20 digits of precision.
np.set_printoptions(precision=20)
# Load the input workbook into a DataFrame.
path = 'distance.xlsx'
df = pd.read_excel(path)
# Add a zero-filled "Dist" column; f() below writes the smallest distance found for each row into it.
# NOTE(review): 27055 is hard-coded -- presumably the sheet's row count; it must
# match the number of rows in df. Confirm, or derive it from len(df).
df = df.assign(Dist=pd.Series(np.zeros(27055)).values);
#df = df.assign(Facility=pd.Series(np.zeros(27055)).values);
# "Facility" is zero-initialised with the structured dtype 'float,float' (a pair of floats per row).
df = df.assign(Facility=pd.Series(np.zeros((27055,),dtype='float,float')).values);
# "Facility_city" starts as an empty string for every row.
df["Facility_city"] = ""
#idx = np.asarray(df.loc[df["lat1"] != '.'].ix[:,0].index)
#temp1 = 1e10
#j = 0
# Index labels of the rows whose lat1 is not '.' (presumably a missing-value
# marker) and whose state equals the state of the first row.
idx = np.asarray(df[(df['lat1']!='.') & (df['state']== df['state'][0])].index)
# f: for each row i, visit the candidate rows listed in idx, compute the
# haversine distance (miles=True) between the candidate's (lat1, long1) and
# row i's (lat2, long2), and keep the smallest distance in Dist together with
# the candidate's coordinates (Facility) and city (Facility_city). Returns df.
# NOTE(review): the indentation of this listing has been lost, so the exact
# nesting of the statements inside f cannot be read off the text.
# NOTE(review): the traceback quoted after this script shows a numba
# TypingError raised at the f(df) call. f receives a pandas DataFrame and
# calls the third-party haversine() function -- presumably that is what
# nopython mode rejects; confirm against the numba documentation.
@nb.jit(nopython=True)
def f(df):
# temp1 is the running minimum distance (1e10 stands for "nothing found yet"); j is the cursor into idx.
temp1 = 1e10
j = 0
for i in range(0, len(df)):
# Compare row i's state with the next row's state.
# NOTE(review): i runs up to len(df) - 1, so i+1 refers past the last row on the final iteration -- confirm.
if df['state'][i+1] != df['state'][i]:
# Rebuild the candidate list for the next row's state.
# NOTE(review): assigning idx here makes it a local name of f, so the
# module-level idx defined above is not the one read below -- confirm intent.
idx = np.asarray(df[(df['lat1']!='.') & (df['state']== df['state'][i+1])].index)
#while (df.iloc[idx[j]]['state'] == df.iloc[i]['state']):
# Visit every candidate in idx.
while (j!=len(idx)):
p1 = (df.iloc[idx[j]]['lat1'],df.iloc[idx[j]]['long1'])
p2 = (df.iloc[i]['lat2'],df.iloc[i]['long2'])
# Store the smaller of the best-so-far distance and this candidate's distance.
df.Dist.iloc[i] = min(temp1,haversine(p1, p2, miles=True))
# True only when this candidate is strictly closer than the previous best.
if df.Dist.iloc[i] < temp1:
#df.Facility.iloc[i] = idx[j]
df.Facility.iloc[i] = (p1[0],p1[1])
df.Facility_city.iloc[i] = df.city.iloc[idx[j]]
temp1 = df.Dist.iloc[i]
j+=1
# Reset the candidate cursor and the running minimum (presumably once per row i).
j = 0
temp1 = 1e10
return df
# Script entry point: run f over the DataFrame and write the result to sheet 'Sheet1' of Results.xlsx.
if __name__ == "__main__":
df = f(df)
writer = ExcelWriter('Results.xlsx')
df.to_excel(writer,'Sheet1')
# NOTE(review): ExcelWriter.save() was removed in pandas 2.0; newer versions
# need writer.close() or a with-block -- confirm against the installed pandas.
writer.save()
The for loop runs close to 30k times, hence my inclination to use a GPU (via FloydHub). I'm struggling to configure my code to run on one.
When I execute this, I get an error.
Traceback (most recent call last):
File "<ipython-input-8-1a4900ad9325>", line 1, in <module>
runfile('/Users/deepayanbhadra/Downloads/Chotka /Chotka.py', wdir='/Users/deepayanbhadra/Downloads/Chotka ')
File "/anaconda3/lib/python3.6/site-packages/spyder/utils/site/sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "/anaconda3/lib/python3.6/site-packages/spyder/utils/site/sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "/Users/deepayanbhadra/Downloads/Chotka /Chotka.py", line 60, in <module>
df = f(df)
File "/anaconda3/lib/python3.6/site-packages/numba/dispatcher.py", line 344, in _compile_for_args
reraise(type(e), e, None)
File "/anaconda3/lib/python3.6/site-packages/numba/six.py", line 658, in reraise
raise value.with_traceback(tb)
TypingError: cannot determine Numba type of <class 'function'>
How do I go about circumventing this? I think numba is the best framework from the references but any other (like PyCUDA) is also okay.
python for-loop gpu numba
pandas is not (yet) supported by numba. You need to extract the df values with arr = df.values and then pass the array to numba. This means that you have to use NumPy indexing (integers, lists, slices, etc.) within the numba-jitted function. Furthermore, your code can easily be improved without numba: it looks like all of it can be replaced by vectorized approaches, yielding performance nearly as good as numba while still letting you use pandas.
– Scotty1-
Nov 23 at 8:45
add a comment |
up vote
0
down vote
favorite
up vote
0
down vote
favorite
I've minimal experience of using GPUs and after looking online, I came up with this:
import pandas as pd
from haversine import haversine
import numpy as np
from pandas import ExcelWriter
import numba as nb
# Print NumPy floats with up to 20 digits of precision.
np.set_printoptions(precision=20)
# Load the input workbook into a DataFrame.
path = 'distance.xlsx'
df = pd.read_excel(path)
# Add a zero-filled "Dist" column; f() below writes the smallest distance found for each row into it.
# NOTE(review): 27055 is hard-coded -- presumably the sheet's row count; it must
# match the number of rows in df. Confirm, or derive it from len(df).
df = df.assign(Dist=pd.Series(np.zeros(27055)).values);
#df = df.assign(Facility=pd.Series(np.zeros(27055)).values);
# "Facility" is zero-initialised with the structured dtype 'float,float' (a pair of floats per row).
df = df.assign(Facility=pd.Series(np.zeros((27055,),dtype='float,float')).values);
# "Facility_city" starts as an empty string for every row.
df["Facility_city"] = ""
#idx = np.asarray(df.loc[df["lat1"] != '.'].ix[:,0].index)
#temp1 = 1e10
#j = 0
# Index labels of the rows whose lat1 is not '.' (presumably a missing-value
# marker) and whose state equals the state of the first row.
idx = np.asarray(df[(df['lat1']!='.') & (df['state']== df['state'][0])].index)
# f: for each row i, visit the candidate rows listed in idx, compute the
# haversine distance (miles=True) between the candidate's (lat1, long1) and
# row i's (lat2, long2), and keep the smallest distance in Dist together with
# the candidate's coordinates (Facility) and city (Facility_city). Returns df.
# NOTE(review): the indentation of this listing has been lost, so the exact
# nesting of the statements inside f cannot be read off the text.
# NOTE(review): the traceback quoted after this script shows a numba
# TypingError raised at the f(df) call. f receives a pandas DataFrame and
# calls the third-party haversine() function -- presumably that is what
# nopython mode rejects; confirm against the numba documentation.
@nb.jit(nopython=True)
def f(df):
# temp1 is the running minimum distance (1e10 stands for "nothing found yet"); j is the cursor into idx.
temp1 = 1e10
j = 0
for i in range(0, len(df)):
# Compare row i's state with the next row's state.
# NOTE(review): i runs up to len(df) - 1, so i+1 refers past the last row on the final iteration -- confirm.
if df['state'][i+1] != df['state'][i]:
# Rebuild the candidate list for the next row's state.
# NOTE(review): assigning idx here makes it a local name of f, so the
# module-level idx defined above is not the one read below -- confirm intent.
idx = np.asarray(df[(df['lat1']!='.') & (df['state']== df['state'][i+1])].index)
#while (df.iloc[idx[j]]['state'] == df.iloc[i]['state']):
# Visit every candidate in idx.
while (j!=len(idx)):
p1 = (df.iloc[idx[j]]['lat1'],df.iloc[idx[j]]['long1'])
p2 = (df.iloc[i]['lat2'],df.iloc[i]['long2'])
# Store the smaller of the best-so-far distance and this candidate's distance.
df.Dist.iloc[i] = min(temp1,haversine(p1, p2, miles=True))
# True only when this candidate is strictly closer than the previous best.
if df.Dist.iloc[i] < temp1:
#df.Facility.iloc[i] = idx[j]
df.Facility.iloc[i] = (p1[0],p1[1])
df.Facility_city.iloc[i] = df.city.iloc[idx[j]]
temp1 = df.Dist.iloc[i]
j+=1
# Reset the candidate cursor and the running minimum (presumably once per row i).
j = 0
temp1 = 1e10
return df
# Script entry point: run f over the DataFrame and write the result to sheet 'Sheet1' of Results.xlsx.
if __name__ == "__main__":
df = f(df)
writer = ExcelWriter('Results.xlsx')
df.to_excel(writer,'Sheet1')
# NOTE(review): ExcelWriter.save() was removed in pandas 2.0; newer versions
# need writer.close() or a with-block -- confirm against the installed pandas.
writer.save()
The for loop runs close to 30k times, hence my inclination to use a GPU (via FloydHub). I'm struggling to configure my code to run on one.
When I execute this, I get an error.
Traceback (most recent call last):
File "<ipython-input-8-1a4900ad9325>", line 1, in <module>
runfile('/Users/deepayanbhadra/Downloads/Chotka /Chotka.py', wdir='/Users/deepayanbhadra/Downloads/Chotka ')
File "/anaconda3/lib/python3.6/site-packages/spyder/utils/site/sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "/anaconda3/lib/python3.6/site-packages/spyder/utils/site/sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "/Users/deepayanbhadra/Downloads/Chotka /Chotka.py", line 60, in <module>
df = f(df)
File "/anaconda3/lib/python3.6/site-packages/numba/dispatcher.py", line 344, in _compile_for_args
reraise(type(e), e, None)
File "/anaconda3/lib/python3.6/site-packages/numba/six.py", line 658, in reraise
raise value.with_traceback(tb)
TypingError: cannot determine Numba type of <class 'function'>
How do I go about circumventing this? I think numba is the best framework from the references but any other (like PyCUDA) is also okay.
python for-loop gpu numba
I've minimal experience of using GPUs and after looking online, I came up with this:
import pandas as pd
from haversine import haversine
import numpy as np
from pandas import ExcelWriter
import numba as nb
# Print NumPy floats with up to 20 digits of precision.
np.set_printoptions(precision=20)
# Load the input workbook into a DataFrame.
path = 'distance.xlsx'
df = pd.read_excel(path)
# Add a zero-filled "Dist" column; f() below writes the smallest distance found for each row into it.
# NOTE(review): 27055 is hard-coded -- presumably the sheet's row count; it must
# match the number of rows in df. Confirm, or derive it from len(df).
df = df.assign(Dist=pd.Series(np.zeros(27055)).values);
#df = df.assign(Facility=pd.Series(np.zeros(27055)).values);
# "Facility" is zero-initialised with the structured dtype 'float,float' (a pair of floats per row).
df = df.assign(Facility=pd.Series(np.zeros((27055,),dtype='float,float')).values);
# "Facility_city" starts as an empty string for every row.
df["Facility_city"] = ""
#idx = np.asarray(df.loc[df["lat1"] != '.'].ix[:,0].index)
#temp1 = 1e10
#j = 0
# Index labels of the rows whose lat1 is not '.' (presumably a missing-value
# marker) and whose state equals the state of the first row.
idx = np.asarray(df[(df['lat1']!='.') & (df['state']== df['state'][0])].index)
# f: for each row i, visit the candidate rows listed in idx, compute the
# haversine distance (miles=True) between the candidate's (lat1, long1) and
# row i's (lat2, long2), and keep the smallest distance in Dist together with
# the candidate's coordinates (Facility) and city (Facility_city). Returns df.
# NOTE(review): the indentation of this listing has been lost, so the exact
# nesting of the statements inside f cannot be read off the text.
# NOTE(review): the traceback quoted after this script shows a numba
# TypingError raised at the f(df) call. f receives a pandas DataFrame and
# calls the third-party haversine() function -- presumably that is what
# nopython mode rejects; confirm against the numba documentation.
@nb.jit(nopython=True)
def f(df):
# temp1 is the running minimum distance (1e10 stands for "nothing found yet"); j is the cursor into idx.
temp1 = 1e10
j = 0
for i in range(0, len(df)):
# Compare row i's state with the next row's state.
# NOTE(review): i runs up to len(df) - 1, so i+1 refers past the last row on the final iteration -- confirm.
if df['state'][i+1] != df['state'][i]:
# Rebuild the candidate list for the next row's state.
# NOTE(review): assigning idx here makes it a local name of f, so the
# module-level idx defined above is not the one read below -- confirm intent.
idx = np.asarray(df[(df['lat1']!='.') & (df['state']== df['state'][i+1])].index)
#while (df.iloc[idx[j]]['state'] == df.iloc[i]['state']):
# Visit every candidate in idx.
while (j!=len(idx)):
p1 = (df.iloc[idx[j]]['lat1'],df.iloc[idx[j]]['long1'])
p2 = (df.iloc[i]['lat2'],df.iloc[i]['long2'])
# Store the smaller of the best-so-far distance and this candidate's distance.
df.Dist.iloc[i] = min(temp1,haversine(p1, p2, miles=True))
# True only when this candidate is strictly closer than the previous best.
if df.Dist.iloc[i] < temp1:
#df.Facility.iloc[i] = idx[j]
df.Facility.iloc[i] = (p1[0],p1[1])
df.Facility_city.iloc[i] = df.city.iloc[idx[j]]
temp1 = df.Dist.iloc[i]
j+=1
# Reset the candidate cursor and the running minimum (presumably once per row i).
j = 0
temp1 = 1e10
return df
# Script entry point: run f over the DataFrame and write the result to sheet 'Sheet1' of Results.xlsx.
if __name__ == "__main__":
df = f(df)
writer = ExcelWriter('Results.xlsx')
df.to_excel(writer,'Sheet1')
# NOTE(review): ExcelWriter.save() was removed in pandas 2.0; newer versions
# need writer.close() or a with-block -- confirm against the installed pandas.
writer.save()
The for loop runs close to 30k times, hence my inclination to use a GPU (via FloydHub). I'm struggling to configure my code to run on one.
When I execute this, I get an error.
Traceback (most recent call last):
File "<ipython-input-8-1a4900ad9325>", line 1, in <module>
runfile('/Users/deepayanbhadra/Downloads/Chotka /Chotka.py', wdir='/Users/deepayanbhadra/Downloads/Chotka ')
File "/anaconda3/lib/python3.6/site-packages/spyder/utils/site/sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "/anaconda3/lib/python3.6/site-packages/spyder/utils/site/sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "/Users/deepayanbhadra/Downloads/Chotka /Chotka.py", line 60, in <module>
df = f(df)
File "/anaconda3/lib/python3.6/site-packages/numba/dispatcher.py", line 344, in _compile_for_args
reraise(type(e), e, None)
File "/anaconda3/lib/python3.6/site-packages/numba/six.py", line 658, in reraise
raise value.with_traceback(tb)
TypingError: cannot determine Numba type of <class 'function'>
How do I go about circumventing this? I think numba is the best framework from the references but any other (like PyCUDA) is also okay.
python for-loop gpu numba
python for-loop gpu numba
asked Nov 21 at 18:16
db18
1417
1417
pandas is not (yet) supported by numba. You need to extract the df values with arr = df.values and then pass the array to numba. This means that you have to use NumPy indexing (integers, lists, slices, etc.) within the numba-jitted function. Furthermore, your code can easily be improved without numba: it looks like all of it can be replaced by vectorized approaches, yielding performance nearly as good as numba while still letting you use pandas.
– Scotty1-
Nov 23 at 8:45
add a comment |
pandas is not (yet) supported by numba. You need to extract the df values with arr = df.values and then pass the array to numba. This means that you have to use NumPy indexing (integers, lists, slices, etc.) within the numba-jitted function. Furthermore, your code can easily be improved without numba: it looks like all of it can be replaced by vectorized approaches, yielding performance nearly as good as numba while still letting you use pandas.
– Scotty1-
Nov 23 at 8:45
pandas is not (yet) supported by numba. You need to extract the df values with arr = df.values and then pass the array to numba. This means that you have to use NumPy indexing (integers, lists, slices, etc.) within the numba-jitted function. Furthermore, your code can easily be improved without numba: it looks like all of it can be replaced by vectorized approaches, yielding performance nearly as good as numba while still letting you use pandas.
– Scotty1-
Nov 23 at 8:45
pandas is not (yet) supported by numba. You need to extract the df values with arr = df.values and then pass the array to numba. This means that you have to use NumPy indexing (integers, lists, slices, etc.) within the numba-jitted function. Furthermore, your code can easily be improved without numba: it looks like all of it can be replaced by vectorized approaches, yielding performance nearly as good as numba while still letting you use pandas.
– Scotty1-
Nov 23 at 8:45
add a comment |
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Some of your past answers have not been well-received, and you're in danger of being blocked from answering.
Please pay close attention to the following guidance:
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
// Page boilerplate: passes a callback to StackExchange.ready; the callback calls
// StackExchange.helpers.onClickDraftSave with '#login-link' (presumably a CSS selector).
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
// Page boilerplate: passes a callback to StackExchange.ready; the callback calls
// StackExchange.openid.initPostLogin with '.new-post-login', a URL-encoded link
// to this question's "#new-answer" anchor, and the tag 'question_page'.
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53418277%2fusing-numba-for-gpu-acceleration-with-large-excel-file%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
// Page boilerplate: passes a callback to StackExchange.ready; the callback calls
// StackExchange.helpers.onClickDraftSave with '#login-link' (presumably a CSS selector).
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
// Page boilerplate: passes a callback to StackExchange.ready; the callback calls
// StackExchange.helpers.onClickDraftSave with '#login-link' (presumably a CSS selector).
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
// Page boilerplate: passes a callback to StackExchange.ready; the callback calls
// StackExchange.helpers.onClickDraftSave with '#login-link' (presumably a CSS selector).
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
pandas is not (yet) supported by numba. You need to extract the df values with arr = df.values and then pass the array to numba. This means that you have to use NumPy indexing (integers, lists, slices, etc.) within the numba-jitted function. Furthermore, your code can easily be improved without numba: it looks like all of it can be replaced by vectorized approaches, yielding performance nearly as good as numba while still letting you use pandas.
– Scotty1-
Nov 23 at 8:45