import numpy as np
import matplotlib.pyplot as plt
from astropy.io import fits
from scipy.optimize import curve_fit
import glob
# We define a helper class to handle the datacube
class MuseFile:
    def __init__(self, filename):
        current_cube = fits.open(filename)
        # flux, in units of 1e-20 erg s^-1 cm^-2 A^-1
        self.flux = current_cube[0].data
        # variance as reported by the instrument
        self.var = current_cube[1].data
        # header with useful information
        self.header = current_cube[0].header
        # dimensions of the cube
        self.nz, self.ny, self.nx = self.flux.shape
        # zero points for wavelength and space
        self.l0 = self.header["CRVAL3"]
        self.x0 = self.header["CRVAL1"]
        self.y0 = self.header["CRVAL2"]
        # spatial sampling, converted from degrees to arcsec
        self.dx = self.header["CD1_1"] * 60 * 60
        self.dy = self.header["CD2_2"] * 60 * 60
        # wavelength sampling (MUSE samples every 1.25 A)
        self.dl = self.header["CD3_3"]
        self.l = self.l0 + np.arange(self.nz) * self.dl
        self.x = self.x0 + np.arange(self.nx) * self.dx
        self.y = self.y0 + np.arange(self.ny) * self.dy
input_file = "../MUSECubes/Lya/lae_cube8.fits"
cube = MuseFile(input_file)
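As a quick sanity check (optional, not part of the original analysis), we can print the cube dimensions and wavelength coverage:
print("cube shape (nz, ny, nx):", cube.nz, cube.ny, cube.nx)
print("wavelength range: %.1f - %.1f A" % (cube.l[0], cube.l[-1]))
print("spatial sampling: %.3f arcsec/pixel" % abs(cube.dx))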
# Get the total spectrum, summed over the sampled spatial range:
spectrum_sum = np.nansum(cube.flux, axis=(1,2))
# uncertainty associated with the stack: for a sum of independent
# pixels, the variances add linearly
spectrum_var = np.nansum(cube.var, axis=(1,2))
spectrum_sigma = np.sqrt(spectrum_var)
#Find the Lya line:
kmax = np.argmax(spectrum_sum)
lmax = cube.l[kmax]
fmax = spectrum_sum[kmax]
c = 3e5  # speed of light in km/s
# velocity offset from the line peak (non-relativistic Doppler shift)
delta_v = c * (cube.l - lmax) / lmax
plt.errorbar(delta_v, spectrum_sum, yerr=spectrum_sigma, capsize=3, color='k')
plt.xlabel(r'$\rm \Delta v / km \ s^{-1} $')
plt.ylabel(r'Flux / 10$^{-20}$ erg s$^{-1}$ cm$^{-2}$ $\rm \AA^{-1}$')
plt.grid()
Shift and rescale the spectrum's flux to the range [0, 1], then normalize it so that it can be interpreted as a probability distribution function.
vmin = delta_v[0]
vmax = delta_v[-1]
dv = delta_v[1]-delta_v[0]
positive_spectrum = (spectrum_sum - np.min(spectrum_sum))/(np.max(spectrum_sum)-np.min(spectrum_sum))
positive_spectrum = positive_spectrum / dv / np.sum(positive_spectrum)
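After this normalization the spectrum integrates to unity over velocity, which we can verify (an optional check, not in the original):
# the Riemann sum of the density over the velocity grid should be 1
print("integral of f:", np.sum(positive_spectrum) * dv)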
plt.plot(delta_v, positive_spectrum, color='k', marker='o')
plt.xlabel(r'$\rm \Delta v / km \ s^{-1} $')
plt.ylabel('f')
plt.grid()
Now use the rejection method to draw random numbers distributed according to $f$: draw a candidate velocity uniformly in $[v_{\rm min}, v_{\rm max}]$ and accept it whenever a second uniform deviate falls below $f$ evaluated at that velocity.
# We are going to draw 30000 velocities distributed as dictated by the spectrum,
# so that we can use the Gaussian mixture model included in sklearn.
N = int(3e4)
# envelope for the rejection step: must be >= f everywhere
f_env = np.max(positive_spectrum)
v = []
while(len(v) < N):
    vrand = vmin + (vmax - vmin) * np.random.rand(N)
    frand = f_env * np.random.rand(N)
    # accept the candidates that fall below the target density
    k, = np.where(frand < np.interp(vrand, delta_v, positive_spectrum))
    v.extend(vrand[k])
# trim to exactly N draws
v = np.asarray(v)[:N]
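As a quick consistency check (optional, not in the original), the sample mean should match the first moment of $f$:
print("sample mean: %.1f km/s" % v.mean())
print("distribution mean: %.1f km/s" % (np.sum(delta_v * positive_spectrum) * dv))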
Compare the distribution of the random draws with the original spectrum.
bins = np.linspace(delta_v[0], delta_v[-1], 200)
yhist, xhist = np.histogram(v, bins=bins)
vmed = 0.5*(xhist[1:]+xhist[:-1])
# normalize the histogram to a density, matching the spectrum's normalization
synthetic_spectrum = yhist / (xhist[1:] - xhist[:-1]) / np.sum(yhist)
plt.plot(delta_v, positive_spectrum, ls='-', label='spectrum')
plt.plot(vmed, synthetic_spectrum, ls='--', label='random draws')
plt.xlabel(r'$\rm \Delta v / km \ s^{-1} $')
plt.ylabel('f')
plt.legend()
plt.grid()
Now, with this synthetic sample, we can use a Gaussian mixture model to determine the parameters of the Gaussian components that best fit the data.
from sklearn.mixture import GaussianMixture
# try mixtures with 1 to 6 components
n_components = np.arange(1, 7)
models = []
AICs = []
for n in n_components:
    model = GaussianMixture(n).fit(v.reshape(-1,1))
    models.append(model)
    AICs.append(model.aic(v.reshape(-1,1)))
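It can be instructive to inspect how the AIC changes with the number of components; this diagnostic plot is an optional addition:
plt.plot(n_components, AICs, marker='o', color='k')
plt.xlabel('number of components')
plt.ylabel('AIC')
plt.grid()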
# we pick the model that minimizes the AIC (Akaike information criterion)
best_model = models[np.argmin(AICs)]
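The parameters of the winning mixture can be read off its standard sklearn attributes:
print("best number of components:", best_model.n_components)
print("weights:", best_model.weights_)
print("means (km/s):", best_model.means_.ravel())
print("sigmas (km/s):", np.sqrt(best_model.covariances_.ravel()))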
#The total probability over all components
logprob = best_model.score_samples(vmed.reshape(-1,1))
pdf = np.exp(logprob)
plt.plot(vmed, pdf, color='k')
plt.plot(delta_v, positive_spectrum, ls='--')
#Now for the individual components
weight = best_model.predict_proba(vmed.reshape(-1,1))
for i in range(best_model.n_components):
    plt.plot(vmed, weight[:,i]*pdf, 'k', alpha=0.6)
plt.xlabel(r'$\rm \Delta v / km \ s^{-1} $')
plt.ylabel('f')
plt.grid()
Clearly, we need to be careful not to overfit the data. It is recommended to restrict the fitting range to the line itself, but I leave the full exercise to you :)
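As a starting point, here is a minimal sketch of such a restriction; the ±1500 km/s window and the 2-component choice are assumptions you should tune to your line:
# keep only the draws close to the line (delta_v is centered on the peak)
window = 1500.0  # km/s; assumed half-width, adjust to your data
v_line = v[np.abs(v) < window]
# 2 components is just an illustrative choice
model_line = GaussianMixture(2).fit(v_line.reshape(-1,1))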