Material prepared for the 52nd Saas-Fee Advanced Course on "The circum-galactic medium across cosmic time: an observational and modelling challenge"

by Alejandro Benitez-Llambay & Michele Fumagalli

Contact for questions or comments: alejandro.benitezllambay@unimib.it

Fitting Gaussian profiles with Gaussian Mixtures

In [4]:
import numpy as np
import matplotlib.pyplot as plt
from astropy.io import fits
from scipy.optimize import curve_fit
import glob

# We define a helper class to handle the datacube
class muse_file():
    def __init__(self, filename):
        current_cube = fits.open(filename)
        #flux in units of 10^-20 erg/s/cm^2/A
        self.flux   = current_cube[0].data 
        #variance as reported by the instrument
        self.var    = current_cube[1].data 
        #header with useful information
        self.header = current_cube[0].header  
        #dimensions of the cube
        self.nz, self.ny, self.nx = self.flux.shape  

        #zero points for wavelength and space.
        self.l0 = self.header["CRVAL3"]
        self.x0 = self.header["CRVAL1"]
        self.y0 = self.header["CRVAL2"]

        #Spatial sampling in arcsec (MUSE spaxels are square,
        #so CD2_2 sets the sampling along both axes)
        self.dx = self.header['CD2_2'] * 60 * 60
        self.dy = self.header['CD2_2'] * 60 * 60

        #Wavelength sampling is done every 1.25 A
        self.dl = self.header["CD3_3"]

        self.l = self.l0 + np.arange(0,self.nz,1)*self.dl
        self.x = self.x0 + np.arange(0,self.nx,1)*self.dx
        self.y = self.y0 + np.arange(0,self.ny,1)*self.dy
In [49]:
input_file = "../MUSECubes/Lya/lae_cube8.fits"
cube = muse_file(input_file)

#Get the mean spectrum within the sampled spatial range:
spectrum_sum = np.nansum(cube.flux, axis=(1,2))

#uncertainty associated with the stack
spectrum_var = np.nansum(cube.var, axis=(1,2))
spectrum_sigma = np.sqrt(spectrum_var)

#Find the Lya line:
kmax = np.argmax(spectrum_sum)
lmax = cube.l[kmax]
fmax = spectrum_sum[kmax]

c = 3e5 #speed of light in km/s
delta_v = c * (cube.l-lmax)/lmax

plt.errorbar(delta_v, spectrum_sum, yerr=spectrum_sigma, capsize=3, color='k')    
plt.xlabel(r'$\rm \Delta v / km \ s^{-1} $')
plt.ylabel(r'Flux / 10$^{-20}$ erg s$^{-1}$ cm$^{-2}$ $\AA^{-1}$')
plt.grid()

Shift and rescale the spectrum's flux to the range [0, 1], and normalize it so that it can be interpreted as a probability distribution function.

In [68]:
vmin = delta_v[0]
vmax = delta_v[-1]
dv   = delta_v[1]-delta_v[0]

#shift and rescale the flux so that it is positive everywhere
positive_spectrum = (spectrum_sum - np.min(spectrum_sum))/(np.max(spectrum_sum)-np.min(spectrum_sum))
#normalize so that the spectrum integrates to 1 over delta_v
positive_spectrum = positive_spectrum / dv / np.sum(positive_spectrum)

plt.plot(delta_v, positive_spectrum, color='k', marker='o')    
plt.xlabel(r'$\rm \Delta v / km \ s^{-1} $')
plt.ylabel('f')
plt.grid()
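
As a quick sanity check, the normalized spectrum should now integrate to unity over the velocity grid (a simple Riemann sum):

In [ ]:
#this should print a value very close to 1
print(np.sum(positive_spectrum) * dv)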

Now use the rejection method to draw random numbers distributed according to $f$: draw a trial velocity $v$ uniformly in $[v_{\rm min}, v_{\rm max}]$ and a height $u$ uniformly in $[0, f_{\rm max}]$, and accept $v$ whenever $u < f(v)$.

In [171]:
# We are going to draw 30000 velocities distributed as dictated by the spectrum, 
# so that we can use the Gaussian Mixture model included in sklearn.

N = int(3e4)
v = []
#height of the rejection envelope: it must bound f everywhere
fenv = np.max(positive_spectrum)

while(len(v)<N):
    vrand = vmin + (vmax-vmin) * np.random.rand(N)
    frand = fenv * np.random.rand(N)
    #accept the trial velocities that fall below the target distribution
    k, = np.where(frand < np.interp(vrand, delta_v, positive_spectrum))
    v.extend(vrand[k])
v = np.asarray(v)

Compare the obtained random distribution with the original spectrum

In [172]:
bins = np.linspace(delta_v[0], delta_v[-1], 200)
yhist, xhist = np.histogram(v, bins=bins)
vmed = 0.5*(xhist[1:]+xhist[:-1])
synthetic_spectrum = yhist / (xhist[1:]-xhist[:-1]) /np.sum(yhist)
plt.plot(delta_v, positive_spectrum, ls='-', label='original spectrum')
plt.plot(vmed, synthetic_spectrum, ls='--', label='random samples')
plt.legend()

plt.xlabel(r'$\rm \Delta v / km \ s^{-1} $')
plt.ylabel('f')
plt.grid()

Now, with this synthetic data, we can use Gaussian Mixtures to determine the parameters of the N Gaussians that best fit the data.
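
Each component of a Gaussian Mixture with $N$ components contributes a weighted normal distribution, so the model for the normalized spectrum is

$$ f(v) = \sum_{i=1}^{N} w_i \, \mathcal{N}(v \mid \mu_i, \sigma_i^2), \qquad \sum_{i=1}^{N} w_i = 1, $$

where the weights $w_i$, means $\mu_i$, and dispersions $\sigma_i$ are the parameters determined by the fit.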

In [173]:
from sklearn.mixture import GaussianMixture

#number of Gaussian components to try (1 to 6)
n_components = np.arange(1,7)
models = []
AICs = []
for i in n_components:
    model = GaussianMixture(i).fit(v.reshape(-1,1))
    models.append(model)
    AICs.append(model.aic(v.reshape(-1,1)))
    
#we pick the model that minimizes the Akaike Information Criterion (AIC)
best_model = models[np.argmin(AICs)]

#The total probability density over all components
logprob = best_model.score_samples(vmed.reshape(-1,1))
pdf = np.exp(logprob)
plt.plot(vmed, pdf, color='k')
plt.plot(delta_v, positive_spectrum, ls='--')

#Now for the individual components of the best model
weight = best_model.predict_proba(vmed.reshape(-1,1))
for i in range(best_model.n_components):
    plt.plot(vmed, weight[:,i]*pdf, 'k', alpha=0.6)
    

plt.xlabel(r'$\rm \Delta v / km \ s^{-1} $')
plt.ylabel('f')
plt.grid()
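
The parameters of the individual Gaussians are stored as attributes of the fitted model. Below is a minimal sketch that prints them; weights_, means_, and covariances_ are standard sklearn GaussianMixture attributes (with the default 'full' covariance, each 1-D component stores a 1x1 covariance matrix).

In [ ]:
#print the weight, centroid, and dispersion of each fitted Gaussian
for i in range(best_model.n_components):
    w     = best_model.weights_[i]
    mu    = best_model.means_[i, 0]
    sigma = np.sqrt(best_model.covariances_[i, 0, 0])
    print("Component %d: weight=%.3f, mu=%.1f km/s, sigma=%.1f km/s" % (i, w, mu, sigma))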

Clearly, we need to be careful not to overfit the data. It is recommended to restrict the fitting range to the line, but I leave this to you :)
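
As a minimal sketch of the suggested exercise, one could restrict the fit to a velocity window around the line before building the distribution (the 2000 km/s half-width below is an arbitrary choice; adjust it to your line):

In [ ]:
#keep only the channels within a velocity window centered on the line peak
vwindow = 2000.0 #km/s; arbitrary half-width, tune it to your line
k, = np.where(np.abs(delta_v) < vwindow)

#renormalize the masked spectrum exactly as before...
f_cut = (spectrum_sum[k] - np.min(spectrum_sum[k])) / (np.max(spectrum_sum[k]) - np.min(spectrum_sum[k]))
f_cut = f_cut / dv / np.sum(f_cut)

#...and repeat the rejection sampling and the Gaussian Mixture fit
#using delta_v[k] and f_cut in place of delta_v and positive_spectrum.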