In [1]:
import matplotlib.pyplot as plt
import numpy as np

This code block makes up a dataset to work with¶

In [2]:
ndum=1000
x = 10*np.random.random(ndum)
y = x + np.random.normal(0,3,ndum)
# and lets put in some bad data values at y=-10
derp=np.random.random(ndum)
y=np.array([-10 if a<0.1 else b for a,b in zip(derp,y)])

# plot our dataset
plt.scatter(x,y,s=5)
plt.xlabel('x')
plt.ylabel('y')
Out[2]:
Text(0, 0.5, 'y')

Now let's do some "binned statistics" to look at the trend in detail.¶

In [3]:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.binned_statistic.html
from scipy.stats import binned_statistic

# first identify good data
good = (y!=-10)


# set up bins in x going from 0 to 10 in steps of 1
binvals = np.arange(0,11,1) 
# work out how many points are in each bin (bin_num) as well as 
# the average and standard deviation of the y values in the bins (bin_avg, bin_std)
bin_num, bin_edges, bin_idx = binned_statistic(x[good], y[good], statistic='count', bins=binvals)
bin_avg, bin_edges, bin_idx = binned_statistic(x[good], y[good], statistic='mean', bins=binvals)
bin_std, bin_edges, bin_idx = binned_statistic(x[good], y[good], statistic='std',  bins=binvals)

# this bit of python magic works out the center of each bin
bin_cent = 0.5*(bin_edges[1:]+bin_edges[:-1])

# plot the data
plt.scatter(x,y,s=5)
plt.xlabel('x')
plt.ylabel('y')

# plot the mean and standard deviation as red points and errorbars
plt.errorbar(bin_cent, bin_avg, yerr=bin_std, fmt='o', color='red')
Out[3]:
<ErrorbarContainer object of 3 artists>
In [4]:
# make a linear fit using np.polyfit, and calculate uncertainty and scatter

coeff,cov=np.polyfit(x[good],y[good],1,cov=True)
coeff_err = np.sqrt(np.diag(cov))
print('    slope = {:.3f} +/- {:.3f}'.format(coeff[0],coeff_err[0]))
print('intercept = {:.3f} +/- {:.3f}'.format(coeff[1],coeff_err[1]))
polynomial=np.poly1d(coeff)
print('  scatter = {:.3f}'.format(np.std(y[good]-polynomial(x[good]))))



# plot the data
plt.scatter(x,y,s=5)
plt.xlabel('x')
plt.ylabel('y')

# overplot your fit
xfit=np.linspace(0,10,100)
plt.plot(xfit,polynomial(xfit),color='green',lw=3)
    slope = 1.001 +/- 0.035
intercept = -0.095 +/- 0.200
  scatter = 3.021
Out[4]:
[<matplotlib.lines.Line2D at 0x7f816ae42f70>]
In [ ]: