
Protein Structure Prediction

Task: Protein Structure Prediction Challenge

The dataset contains 245 fold types and 11,843 protein sequence samples in total, of which 9,472 are in the training set and 2,371 in the test set.
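
For reference, each record in the .fa files looks roughly like the sketch below. The header layout is inferred from the read_fa parser further down, and the id/class values here are invented for illustration: a '>' header line carries the sample id and, in the training set, a SCOP-style class string, and the fold label is the first two fields of that class:

# Illustrative header parsing (values invented; format inferred from read_fa):
header = '>d1a1x__ b.63.1.1'              # sample id + full class (train only)
info = header[1:].split(' ')              # ['d1a1x__', 'b.63.1.1']
label = '.'.join(info[1].split('.')[:2])  # 'b.63' -- one of the 245 fold types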

Following the earlier LightGBM baseline, I tried a word2vec + neural network approach, which yielded little improvement. Today I tried a bidirectional GRU model, which scores a few percentage points higher than the previous attempts.

Code:

import numpy as np
import pandas as pd
from sklearn.utils import shuffle

# Data loading
def read_fa(file, mode='train'):
    assert mode in {'train', 'test'}
    labels = []
    seqs_info = []
    cates_id = []
    seq = ''
    with open(file, mode='r') as f:
        line = f.readline().strip()
        while line:
            if line[0] == '>':
                # Header line: sample id plus, in the training set, the class string
                info = line[1:].split(' ')
                cates_id.append(info[0])
                if mode == 'train':
                    # Keep the first two fields of the class, e.g. 'a.1.1.2' -> 'a.1'
                    label = ''.join(info[1].split('.')[:2])
                    label = label[0] + '.' + label[1:]
                    labels.append(label)
                if seq:
                    # Flush the previous record's sequence
                    seqs_info.append(seq)
                    seq = ''
            else:
                # A sequence may span several lines; concatenate them
                seq += line
            line = f.readline().strip()
    seqs_info.append(seq)

    return cates_id, seqs_info, labels

def load_data():
    train_file = '/kaggle/input/textfiles/astral_train.fa'
    test_file = '/kaggle/input/textfiles/astral_test.fa'

    train_sample_id, train_seqs_info, train_labels = read_fa(train_file, mode='train')
    test_sample_id, test_seqs_info, _ = read_fa(test_file, mode='test')

    train_data = {
        'sample_id': train_sample_id,
        'seq_info': train_seqs_info,
        'label': train_labels
    }

    test_data = {
        'sample_id': test_sample_id,
        'seq_info': test_seqs_info,
    }

    train = pd.DataFrame(data=train_data)
    train = shuffle(train, random_state=2021).reset_index(drop=True)
    test = pd.DataFrame(data=test_data)

    return train, test

# Sliding-window tokenization
def split_windows(sentence, w=3):
    new_sentence = []
    for i in range(len(sentence) - w + 1):
        new_sentence.append(sentence[i:i + w])

    return new_sentence
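
# Quick illustration (added): split_windows('MKVL', w=3) -> ['MKV', 'KVL'];
# with w=1 (used below) this is plain character-level tokenization:
# ['M', 'K', 'V', 'L'].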

data = pd.concat(load_data(), ignore_index=True)
# label to idx
label2idx = {l: idx for idx, l in enumerate(data[~data['label'].isna()]['label'].unique().tolist())}
idx2label = {idx: l for l, idx in label2idx.items()}

data['label'] = data['label'].map(label2idx)
data['new_seq_info'] = data['seq_info'].apply(lambda x: split_windows(x, w=1))
train, test = data[~data['label'].isna()].reset_index(drop=True), data[data['label'].isna()].reset_index(drop=True)

max_features = 1000
max_len = 256
embed_size = 128
batch_size = 24
epochs = 50
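# Note (added observation): with w=1 the vocabulary is just the ~25
# amino-acid letters, so max_features=1000 is a loose upper bound; a w=3
# sliding window would instead need to cover up to 25**3 possible 3-mers.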

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

tokens = Tokenizer(num_words=max_features)
tokens.fit_on_texts(list(data['new_seq_info']))

x_data = tokens.texts_to_sequences(data['new_seq_info'])
x_data = sequence.pad_sequences(x_data, maxlen=max_len)
x_train = x_data[:9472]
y_train = data['label'][:9472]
x_test = x_data[9472:]
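# Note (added for clarity): the hard-coded 9472/2371 split relies on
# pd.concat keeping all training rows before the test rows, which holds
# because load_data() returns (train, test) in that order.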

from keras.layers import Dense, Input, LSTM, Bidirectional, Activation, Conv1D, GRU
from keras.layers import Dropout, Embedding, GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, concatenate, SpatialDropout1D
# Keras callback functions:
from keras.callbacks import Callback
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.models import Model
from keras.optimizers import Adam
import keras
sequence_input = Input(shape=(max_len,))
x = Embedding(max_features, embed_size, trainable=True)(sequence_input)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = Conv1D(64, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform')(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
preds = Dense(245)(x)
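# Dense(245) emits raw logits (no softmax); this matches
# from_logits=True in the loss below.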

model = Model(sequence_input, preds)
model.compile(loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer=keras.optimizers.Adam(1e-3),
              metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=batch_size,
          validation_split=0.2,
          epochs=epochs)
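
The snippet above stops after training and never scores the test set. For completeness, here is a minimal inference-and-submission sketch; the output file name and the sample_id/category_id column names are assumptions, not the competition's confirmed submission format:

# Minimal inference sketch (assumed submission format, adjust as required)
pred_logits = model.predict(x_test, batch_size=batch_size)
pred_idx = pred_logits.argmax(axis=1)

# Map predicted indices back to the original fold labels and write a CSV
submission = pd.DataFrame({
    'sample_id': test['sample_id'],
    'category_id': [idx2label[i] for i in pred_idx],
})
submission.to_csv('submission.csv', index=False)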

Submission result: currently ranked 39 out of 130 submitting teams.

[leaderboard screenshot]
