Meta Interview Question

Q. How would yhou approach high-dimensional dataset

Interview Answer

Anonymous

Nov 4, 2021

Dimensionality reduction. PCA? # evaluate pca with logistic regression algorithm for classification from numpy import mean from numpy import std from sklearn.datasets import make_classification from sklearn.model_selection import cross_val_score from sklearn.model_selection import RepeatedStratifiedKFold from sklearn.pipeline import Pipeline from sklearn.decomposition import PCA from sklearn.linear_model import LogisticRegression # define dataset X, y = make_classification(n_samples=1000, n_features=20, n_informative=10, n_redundant=10, random_state=7) # define the pipeline steps = [('pca', PCA(n_components=10)), ('m', LogisticRegression())] model = Pipeline(steps=steps) # evaluate model cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1) n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1) # report performance print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores))) LDA? # evaluate lda with logistic regression algorithm for classification from numpy import mean from numpy import std from sklearn.datasets import make_classification from sklearn.model_selection import cross_val_score from sklearn.model_selection import RepeatedStratifiedKFold from sklearn.pipeline import Pipeline from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.linear_model import LogisticRegression # define dataset X, y = make_classification(n_samples=1000, n_features=20, n_informative=10, n_redundant=10, random_state=7) # define the pipeline steps = [('lda', LinearDiscriminantAnalysis(n_components=1)), ('m', LogisticRegression())] model = Pipeline(steps=steps) # evaluate model cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1) n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1) # report performance print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))