Set up environment.

In [0]:
from google.colab import drive
import os
from PIL import Image
import cv2
import pandas as pd
import numpy as np
import scipy
import scipy.misc      # for imresize, used in image_generator below
import scipy.ndimage   # for rotate, used in image_generator below
import random
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Javascript, display, Markdown, clear_output

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


!pip install Augmentor
import Augmentor


import pickle

# Convenient pickle wrappers
def save(obj, file):
    # SAVE is a global variable holding the output directory
    pickle.dump(obj, open(SAVE + file, 'wb'))

def load(file):
    return pickle.load(open(SAVE + file, 'rb'))

def RMSE(pred, truth):
    # Root mean squared error over all flattened prediction/truth values.
    return np.sqrt(np.mean(np.square(np.array(pred).flatten() - np.array(truth).flatten())))

RANDOM = 42
random.seed(RANDOM)

display(Markdown("# Markdown is working"))
Requirement already satisfied: Augmentor in /usr/local/lib/python3.6/dist-packages (0.2.8)
Requirement already satisfied: numpy>=1.11.0 in /usr/local/lib/python3.6/dist-packages (from Augmentor) (1.18.4)
Requirement already satisfied: future>=0.16.0 in /usr/local/lib/python3.6/dist-packages (from Augmentor) (0.16.0)
Requirement already satisfied: tqdm>=4.9.0 in /usr/local/lib/python3.6/dist-packages (from Augmentor) (4.41.1)
Requirement already satisfied: Pillow>=5.2.0 in /usr/local/lib/python3.6/dist-packages (from Augmentor) (7.0.0)

Markdown is working

Mounting Google Drive

This is where my Kaggle API key lives, and it's also where trained models, figures, and other artifacts get saved.

In [0]:
%%capture

display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 5000})'''))

# Mount Google Drive and load Kaggle API key into environment. 
# You'll have to customize this step for yourself. 
drive.mount('/content/gdrive')
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/School/CS497/kaggle/"

# The data is directories of jpg images. A lot of them. 
DATA = "/content/images_training_rev1/"
!mkdir /content/processed_64x64
!mkdir /content/processed_128x128
DIR_64 = '/content/processed_64x64/'
DIR_128 = '/content/processed_128x128/'
!mkdir /content/processed_64x64_WIDE
!mkdir /content/processed_128x128_WIDE
DIR_64_WIDE = '/content/processed_64x64_WIDE/'
DIR_128_WIDE = '/content/processed_128x128_WIDE/'



# Directory for saving models 
SAVE = "/content/gdrive/My Drive/School/CS497/GalaxyZoo/"
FIGS = "/content/gdrive/My Drive/School/CS497/GalaxyZoo/figs/"

Download and extract data.

In [0]:
%%capture

# Configure Kaggle to work with Colab and download Galaxy Zoo dataset. 
# https://www.kaggle.com/c/galaxy-zoo-the-galaxy-challenge 
# Original paper: https://arxiv.org/abs/1308.3496 
# If you are running this on your own, don't forget that you have to log into 
# the Kaggle website and manually accept the terms of use for the data. 
!pip uninstall -y kaggle
!pip install --upgrade pip
!pip install kaggle==1.5.6
!kaggle -v
!kaggle competitions download galaxy-zoo-the-galaxy-challenge

# Unpack training and validation datasets
!unzip /content/galaxy-zoo-the-galaxy-challenge.zip
!rm /content/galaxy-zoo-the-galaxy-challenge.zip
!unzip /content/images_training_rev1.zip
!unzip /content/training_solutions_rev1.zip

Load data into DataFrame

In [0]:
images = [f for f in os.listdir(DATA) if os.path.isfile(os.path.join(DATA, f))]
print("There are " + '{:,}'.format(len(images)) + " images in the dataset.")
labels = pd.read_csv('training_solutions_rev1.csv')
labels.GalaxyID = labels.GalaxyID.apply(lambda id: str(int(id)) + '.jpg')
save(labels, 'labels.p')
print("There are " + '{:,}'.format(labels.shape[0]) + " truth values.")
print("There are " + '{:,}'.format(labels.shape[1]-1) + " categories for classification.")
# Human-readable descriptions of the 37 classes, grouped by question.
desc = ['Smooth', 'Featured or disc', 'Star or artifact',
        'Edge on', 'Not edge on',
        'Bar through center', 'No bar',
        'Spiral', 'No Spiral',
        'No bulge', 'Just noticeable bulge', 'Obvious bulge', 'Dominant bulge',
        'Odd Feature', 'No Odd Feature',
        'Completely round', 'In between', 'Cigar shaped',
        'Ring (Oddity)', 'Lens or arc (Oddity)', 'Disturbed (Oddity)', 'Irregular (Oddity)',
        'Other (Oddity)', 'Merger (Oddity)', 'Dust lane (Oddity)',
        'Rounded bulge', 'Boxy bulge', 'No bulge',
        'Tightly wound arms', 'Medium wound arms', 'Loose wound arms',
        '1 Spiral Arm', '2 Spiral Arms', '3 Spiral Arms', '4 Spiral Arms',
        'More than four Spiral Arms', "Can't tell"]
There are 61,578 images in the dataset.
There are 61,578 truth values.
There are 37 categories for classification.

Utility functions

All of the utility or feature extraction functions are moved up into this cell for convenience.

In [0]:
def average_color(pic):
    '''
    pic is a 4 dimensional array where d0 is the index, d1 and d2 are the 
    x and y values of the image and d3 is the values of the pixels (R,G,B 
    in this dataset but it will scale to anything).

    I'm taking the mean value of pixels in each image by summing across 
    d1 and d2 then dividing by the total number of pixels. 
    '''
    return np.sum(pic, axis=(1,2)) / (pic.shape[1] * pic.shape[2])
    

def center_pixel(pic):
    return pic[:,int(pic.shape[1] / 2),int(pic.shape[2] / 2),:]
        

def image_generator(pics, path=DATA, batch_size=30, rotate=False, size=100, save=False,
                    retrieve=False):
    '''
    Generate batches of numpy arrays from a list of image filenames.
    DATA is a global variable with the path to the image folder.  

    Output array has 4 dimensions in this order: 
        - Index of pictures in this batch 
        - x dimension of pixels in individual image 
        - y dimension of pixels in individual image
        - Color depth (R,G,B in this project, but it can scale)
    '''
    l = len(pics)
    batches = int(l / batch_size)
    leftover = l % batch_size
    for batch in range(batches):
        start = batch * batch_size
        this_batch = pics[start:start+batch_size]
        if rotate:
            # Rotate each image by a random angle (global seed set above), then resize.
            yield np.array([
                            scipy.misc.imresize(
                                scipy.ndimage.rotate(
                                    plt.imread(path + pic, format='jpg'),
                                    angle=random.randint(0, 360),
                                    reshape=False
                                ),
                                size=size
                            )
                            for pic in this_batch])
        else:
            yield np.array([ plt.imread(path + pic, format='jpg') for pic in this_batch])
    # Yield the final partial batch, if any.
    if leftover:
        start = batches * batch_size
        this_batch = pics[start:start+leftover]
        yield np.array([ plt.imread(path + pic, format='jpg') for pic in this_batch])
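
A minimal usage sketch (assuming `images` and `DATA` from the cells above; the batch size is arbitrary and only the first batch is inspected):

for i, batch in enumerate(image_generator(images, path=DATA, batch_size=64)):
    # Each batch is a numpy array of shape (batch, height, width, 3).
    if i == 0:
        print("First batch shape:", batch.shape)
        break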

Exploratory Data Analysis

Labels

This dataset was hand-labeled for the Galaxy Zoo 2 project, using an online crowdsourcing tool. The data was downloaded from Kaggle's Galaxy Zoo Challenge competition via the Kaggle CLI tool. The original images in the Galaxy Zoo dataset contain much more information than these jpg images, and that information was removed to focus the challenge on modelling the image analysis process of the humans viewing these images.

The labels represent a decision tree labeled with 37 classes of the format "ClassA.B", where A is the question number in the decision tree (from 1 to 11) and B is the answer choice for that question. This is different from a typical classification problem because each class represents an attribute, and most images will contain multiple attributes. The values represent the confidence of the crowd that a given answer is correct, from zero to one.
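
For a concrete look at the format (a minimal inspection sketch; the column names come straight from the CSV loaded above):

# One row of labels: a GalaxyID plus 37 confidence values in [0, 1].
example = labels.iloc[0]
print(example['GalaxyID'])
print(example.drop('GalaxyID').sort_values(ascending=False).head(5))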

The decision process traverses forwards and backwards through a list of questions, and each question-and-answer combination is represented as a class in the truth labels. The decision tree always terminates at an END class: 1.3, 6.2, or one of the 8.x answers.

The decision tree described in the paper is as follows:

  1. Is the galaxy simply smooth and rounded, with no sign of a disk?
    • 1.1 Smooth? -> #7
    • 1.2 Featured or disc? -> #2
    • 1.3 Star or artifact? -> END
  2. Could this be a disk viewed edge-on?
    • 2.1 Yes -> #9
    • 2.2 No -> #3
  3. Is there a sign of a bar feature through the centre of the galaxy?
    • 3.1 Yes -> #4
    • 3.2 No -> #4
  4. Is there any sign of a spiral pattern?
    • 4.1 Yes -> #10
    • 4.2 No -> #5
  5. How prominent is the central bulge, compared with the rest of the galaxy?
    • 5.1 No bulge -> #6
    • 5.2 Just noticeable -> #6
    • 5.3 Obvious -> #6
    • 5.4 Dominant -> #6
  6. Is there anything odd?
    • 6.1 Yes -> #8
    • 6.2 No -> END
  7. How rounded is it?
    • 7.1 Completely round -> #6
    • 7.2 In between -> #6
    • 7.3 Cigar shaped -> #6
  8. Is the odd feature a ring, or is the galaxy disturbed or irregular?
    • 8.1 Ring -> END
    • 8.2 Lens or arc -> END
    • 8.3 Disturbed -> END
    • 8.4 Irregular -> END
    • 8.5 Other -> END
    • 8.6 Merger -> END
    • 8.7 Dust lane -> END
  9. Does the galaxy have a bulge at its centre? If so, what shape?
    • 9.1 Rounded -> #6
    • 9.2 Boxy -> #6
    • 9.3 No bulge -> #6
  10. How tightly wound do the spiral arms appear?
    • 10.1 Tight -> #11
    • 10.2 Medium -> #11
    • 10.3 Loose -> #11
  11. How many spiral arms are there?
    • 11.1 1 -> #5
    • 11.2 2 -> #5
    • 11.3 3 -> #5
    • 11.4 4 -> #5
    • 11.5 More than four -> #5
    • 11.6 Can't tell -> #5

Notice that for most of the questions, all responses lead to the same follow-up question. Unlike a typical decision tree, the destination isn't the defining characteristic of the object being analyzed. Rather, it's a process to build a list of attributes that describe a given galaxy.
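
Since the class columns map one-to-one onto these question/answer pairs, the routing can be transcribed into a small lookup table. This is just a re-statement of the tree above (no new information), and it yields the list of terminal classes that comes up again in the cells below:

# Map each answer class to the next question number, or 'END' if the path terminates there.
next_question = {
    'Class1.1': 7, 'Class1.2': 2, 'Class1.3': 'END',
    'Class2.1': 9, 'Class2.2': 3,
    'Class3.1': 4, 'Class3.2': 4,
    'Class4.1': 10, 'Class4.2': 5,
    'Class5.1': 6, 'Class5.2': 6, 'Class5.3': 6, 'Class5.4': 6,
    'Class6.1': 8, 'Class6.2': 'END',
    'Class7.1': 6, 'Class7.2': 6, 'Class7.3': 6,
    **{'Class8.%d' % i: 'END' for i in range(1, 8)},
    'Class9.1': 6, 'Class9.2': 6, 'Class9.3': 6,
    'Class10.1': 11, 'Class10.2': 11, 'Class10.3': 11,
    **{'Class11.%d' % i: 5 for i in range(1, 7)},
}
terminal_classes = [c for c, nxt in next_question.items() if nxt == 'END']
print(terminal_classes)   # Class1.3, Class6.2, and Class8.1 through Class8.7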

In [0]:
title = 'Mean Confidence Level of All Classes'
plt.figure(title, figsize=(20, 5))
plt.suptitle(title, fontsize=20)
sns.barplot(x= labels.drop('GalaxyID', axis=1, inplace=False).mean().index, 
            y = labels.drop('GalaxyID', axis=1, inplace=False).mean().values)
plt.yticks(fontsize=16)
plt.xticks(fontsize=16, rotation=35, ha='right')
plt.show()
In [0]:
title = 'Instances Where Confidence Level is Above 0.5'
plt.figure(title, figsize=(20, 5))
plt.suptitle(title, fontsize=20)
sns.barplot(x= labels.columns[1:] ,
            y = np.where(labels.drop('GalaxyID', axis=1, inplace=False).to_numpy() >= 0.5, 1, 0).sum(axis=0))
plt.yticks(fontsize=16)
plt.xticks(fontsize=16, rotation=35, ha='right')
plt.show()
In [0]:
terminals = [2, 14, 18, 19, 20, 21, 22, 23, 24]  # column indices (after GalaxyID) of the END classes
counts = np.where(labels.drop('GalaxyID', axis=1, inplace=False).to_numpy() >= 0.5, 1, 0).sum(axis=0)
print("Sum of instances of terminal classes where the confidence level is above 0.5.\n")
for n, v in zip([labels.columns[i+1] for i in terminals], [counts[i] for i in terminals]):
    print("%s: %d" % (n, v))
Sum of instances of terminal classes where the confidence level is above 0.5.

Class1.3: 44
Class6.2: 53115
Class8.1: 886
Class8.2: 4
Class8.3: 16
Class8.4: 301
Class8.5: 202
Class8.6: 1022
Class8.7: 27

Odd or not?

In [0]:
odd = [np.where(labels.drop('GalaxyID', axis=1, inplace=False).to_numpy() >= 0.5, 1, 0).sum(axis=0)[i] for i in [13,14]] 
print("Odd: {:,} or {:0.1f}%".format(odd[0], 100*odd[0]/sum(odd)))
print("Not Odd: {:,} or {:0.1f}%".format(odd[1], 100*odd[1]/sum(odd)))
Odd: 8,484 or 13.8%
Not Odd: 53,115 or 86.2%

Looking at the data, we can see a few things. First, stars and artifacts are extremely rare in this dataset, with only 44 instances. Pretty much 0%. This means we won't be able to build a model that filters out anomalous data. Second, we see that the majority of galaxies are not odd. Only about 14% are odd and go on to question 8 to check the type of oddity, and of the odd attributes most are mergers, rings, irregular galaxies, or "other."

There's one question I'd really like to answer: do the majority of crowdsourced labelers follow the same decision tree? If they do, the correlation matrix should tightly follow the logical progression of the decision tree. Class1.3 will correlate with nothing, Class6.1 (odd feature present) will correlate heavily with Class8.x (types of odd features) and very little with Class6.2 (no odd features), and so on. This means looking at 37² feature pairs, which is a lot to scan by hand, but a correlation heatmap makes it manageable to browse at a glance.
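
As a quick complement to the heatmap below, the strongest pairs can also be ranked directly. A minimal sketch with pandas (assuming `labels` is loaded as above):

# Rank the most strongly correlated pairs of answer columns (upper triangle only,
# so each pair appears once and the diagonal is excluded).
corr = labels.drop('GalaxyID', axis=1, inplace=False).corr()
pairs = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
print(pairs.stack().sort_values(ascending=False).head(10))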

In [0]:
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 5000})'''))

def heatmap(df, title):
    # Plot pairwise correlations between the columns of df as an annotated heatmap.
    plt.figure(title, figsize=[20,20])
    plt.title(title, fontsize=30)
    df_corr = df.corr()
    #df_corr = np.triu(df_corr, k=1)
    sns.heatmap(df_corr, vmax=0.6, square=True, annot=True, cmap='YlOrRd', cbar=False)
    plt.yticks(rotation = 45)
    plt.xticks(rotation = 45)
    plt.show()

heatmap(labels.drop('GalaxyID', axis=1, inplace=False), 'Correlation Between Question Answers')

This heatmap gives an overview of the correlations between all answers to every question.

Here we can see that the relatively rare Class1.3 (which represents stars or other non-galaxies) is correlated with the path through Class1.1 to ambiguous shapes and odd attributes. Labelers may have confused stars with galaxies that have odd appearances. Things like this are a reminder that hand-labeled data doesn't represent the absolute truth, but rather an approximation of it. Remember that the challenge associated with this dataset is about modelling the human thought process behind answering these questions.

We can see from the negative correlation between Class6.1 and Class6.2 that disagreement over whether an image had an odd feature was rare. We do, however, see some correlation between the Class8.3/8.4/8.6 features, which means there was some confusion between disturbed, irregular, and merged galaxies. There were only 16 instances with confidence in Class8.3, 301 in Class8.4, and 1,022 in Class8.6. With this in mind, I would prioritize building a model that is effective at correctly predicting Class6.1 and Class8.4 and not worry too much about the other odd features. Class8.x represents a sub-problem that this dataset is not well-equipped to solve, and the loss function should reflect this.
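
To make that concrete, a per-class weighting could be folded directly into the training loss. The sketch below is an illustrative weighted MSE in PyTorch; the weight values and the column indices for Class6.1 and Class8.x are assumptions for demonstration, not tuned numbers from this notebook:

# Weighted MSE: down-weight the rarely-confident Class8.x columns and up-weight
# the classes we care most about (e.g. Class6.1). pred and target are
# (batch, 37) tensors of confidences.
class_weights = torch.ones(37)
class_weights[18:25] = 0.2   # Class8.1-8.7 occupy columns 18-24; illustrative value
class_weights[13] = 2.0      # Class6.1; illustrative value

def weighted_mse(pred, target, weights=class_weights):
    return torch.mean(weights * (pred - target) ** 2)

This would simply stand in wherever the training loop would otherwise use nn.MSELoss().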

Zooming into individual questions can show us how well humans can discern between different attributes.

In [0]:
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 5000})'''))

groups = [
          ['Class1.1', 'Class1.2', 'Class1.3'],
          ['Class2.1', 'Class2.2'],
          ['Class3.1', 'Class3.2'],
          ['Class4.1', 'Class4.2'],
          ['Class5.1', 'Class5.2', 'Class5.3', 'Class5.4'],
          ['Class6.1', 'Class6.2'],
          ['Class7.1', 'Class7.2', 'Class7.3'],
          ['Class8.1', 'Class8.2', 'Class8.3', 'Class8.4', 'Class8.5', 'Class8.6', 'Class8.7'],
          ['Class9.1','Class9.2','Class9.3'],
          ['Class10.1', 'Class10.2', 'Class10.3'],
          ['Class11.1','Class11.2','Class11.3','Class11.4','Class11.5','Class11.6']
]

size = 15
fig = plt.figure('Individual Question Heatmaps', figsize=[size,size*3/4])
for i, group in enumerate(groups):
    fig.add_subplot(3, 4, i+1)
    plt.title('Class' + str(i+1) + '.n') 
    sns.heatmap(labels[group].corr(), square=True, annot=True,  cmap='YlOrRd', cbar=False)
    plt.xticks([x + 0.5 for x in range(len(group))], labels=[s.replace('Class','') for s in group])
    plt.yticks([y + 0.5 for y in range(len(group))], labels=[s.replace('Class','') for s in group])
    #plt.xlabel(str(i))
    
fig.tight_layout(pad=3.0)

plt.show()