import numpy as np
import pandas as pd
from sklearn.utils import shuffle
def read_fa(file, mode='train'):
    """Parse a FASTA file and return sample ids, sequences and, in train mode,
    the SCOP fold labels (first two levels of the classification string)."""
    assert mode in {'train', 'test'}
    labels = []
    seqs_info = []
    cates_id = []
    seq = ''
    with open(file, mode='r') as f:
        line = f.readline().strip()
        while line:
            if line[0] == '>':
                # Header line, e.g. '>d1dlwa_ a.1.1.1 ...'
                info = line[1:].split(' ')
                cates_id.append(info[0])
                if mode == 'train':
                    # Keep the first two levels: 'a.1.1.1' -> 'a.1'
                    label = '.'.join(info[1].split('.')[:2])
                    labels.append(label)
                if seq:
                    seqs_info.append(seq)
                    seq = ''
            else:
                seq += line
            line = f.readline().strip()
        seqs_info.append(seq)
    return cates_id, seqs_info, labels
def load_data():
    train_file = '/kaggle/input/textfiles/astral_train.fa'
    test_file = '/kaggle/input/textfiles/astral_test.fa'

    train_sample_id, train_seqs_info, train_labels = read_fa(train_file, mode='train')
    test_sample_id, test_seqs_info, _ = read_fa(test_file, mode='test')

    train_data = {
        'sample_id': train_sample_id,
        'seq_info': train_seqs_info,
        'label': train_labels,
    }
    test_data = {
        'sample_id': test_sample_id,
        'seq_info': test_seqs_info,
    }

    train = pd.DataFrame(data=train_data)
    train = shuffle(train, random_state=2021).reset_index(drop=True)
    test = pd.DataFrame(data=test_data)
    return train, test
def split_windows(sentence, w=3):
    """Slide a window of width w over a sequence and return the overlapping substrings."""
    new_sentence = []
    for i in range(len(sentence) - w + 1):
        new_sentence.append(sentence[i:i + w])
    return new_sentence


# Stack train and test so the tokenizer below is fit on a single vocabulary;
# test rows have no 'label' column, so their labels become NaN.
data = pd.concat(load_data(), ignore_index=True)
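# Toy illustration (my own example string, not from the dataset):
#   split_windows('MKVL', w=2) -> ['MK', 'KV', 'VL']
# With w=1, as used below, each sequence is simply its list of residues,
# i.e. character-level tokens.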
# Build label <-> index mappings from the labelled (train) rows only.
label2idx = {l: idx for idx, l in enumerate(data[~data['label'].isna()]['label'].unique().tolist())}
idx2label = {idx: l for l, idx in label2idx.items()}
data['label'] = data['label'].map(label2idx)
data['new_seq_info'] = data['seq_info'].apply(lambda x: split_windows(x, w=1))
# Labelled rows are train, unlabelled (NaN) rows are test.
train = data[~data['label'].isna()].reset_index(drop=True)
test = data[data['label'].isna()].reset_index(drop=True)

# Hyperparameters
max_features = 1000
max_len = 256
embed_size = 128
batch_size = 24
epochs = 50
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
tokens = Tokenizer(num_words=max_features)
tokens.fit_on_texts(list(data['new_seq_info']))
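# Note: with w=1 windows the fitted vocabulary is tiny (roughly the 20-25
# amino acid codes), so max_features=1000 is a loose upper bound, not a real cap.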
x_data = tokens.texts_to_sequences(data['new_seq_info'])
x_data = sequence.pad_sequences(x_data, maxlen=max_len)

# The first len(train) rows of `data` are the labelled training samples
# (9472 here); everything after is the unlabelled test set.
n_train = len(train)
x_train = x_data[:n_train]
y_train = data['label'][:n_train]
x_test = x_data[n_train:]
import keras
from keras.layers import Dense, Input, Bidirectional, Conv1D, GRU
from keras.layers import Embedding, SpatialDropout1D
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate
from keras.models import Model

sequence_input = Input(shape=(max_len,))
x = Embedding(max_features, embed_size, trainable=True)(sequence_input)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = Conv1D(64, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform')(x)
# Summarise the sequence with both average- and max-pooling, then classify.
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
preds = Dense(len(label2idx))(x)  # 245 fold classes; raw logits, no softmax
model = Model(sequence_input, preds)
model.compile(loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer=keras.optimizers.Adam(1e-3),
              metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=batch_size,
          validation_split=0.2,
          epochs=epochs)
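# Inference sketch (my addition, not part of the original script): predict the
# test set and map class indices back to fold labels via idx2label. The column
# name 'pred_label' and the file name 'submission.csv' are assumptions.
probs = model.predict(x_test, batch_size=batch_size)
test['pred_label'] = [idx2label[i] for i in probs.argmax(axis=-1)]
test[['sample_id', 'pred_label']].to_csv('submission.csv', index=False)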