盒子
盒子

蛋白质结构预测之lgb的baseline

赛题:蛋白质结构预测挑战赛

代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
################## utils.py #####################

def read_fa(file, mode='train'):
    """Parse a FASTA-style file into parallel lists of ids, sequences and labels.

    A record starts with a header line ``>ID FIELD ...`` and is followed by
    zero or more sequence lines, which are concatenated into one string.
    Blank lines are skipped, and sequence text that appears before the first
    header is ignored because it belongs to no record.

    Args:
        file: Path of the file to read.
        mode: ``'train'`` or ``'test'``. In ``'train'`` mode the second
            space-separated header field is split on ``'.'`` and its first
            two components are joined (without a separator) to form the label.

    Returns:
        A tuple ``(cates_id, seqs_info, labels)``. ``cates_id`` and
        ``seqs_info`` hold exactly one entry per header (an empty string for
        a header without sequence lines). ``labels`` holds one entry per
        header in ``'train'`` mode and is empty in ``'test'`` mode.

    Raises:
        AssertionError: If ``mode`` is neither ``'train'`` nor ``'test'``
            (only when assertions are enabled).
        IndexError: In ``'train'`` mode, if a header has no second field.
    """
    assert mode in {'train', 'test'}
    cates_id = []
    seqs_info = []
    labels = []
    # Pieces of the record currently being read; None until the first header.
    chunks = None

    with open(file, mode='r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # A blank line is not the end of the file; keep reading.
                continue
            if line[0] == '>':
                # Flush the previous record even when it is empty so that
                # ids, labels and sequences stay aligned index by index.
                if chunks is not None:
                    seqs_info.append(''.join(chunks))
                chunks = []
                info = line[1:].split(' ')
                cates_id.append(info[0])
                if mode == 'train':
                    labels.append(''.join(info[1].split('.')[:2]))
            elif chunks is not None:
                chunks.append(line)

    # Flush the last record, if the file contained any header at all.
    if chunks is not None:
        seqs_info.append(''.join(chunks))

    return cates_id, seqs_info, labels

################## main.py #####################
from utils import *
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold as KFold
import lightgbm as lgb

# Locations of the training and test FASTA files.
train_file = './训练集/astral_train.fa'
test_file = './测试集/astral_test.fa'

# Parse both files; the test file carries no labels, so that slot is discarded.
train_sample_id, train_seqs_info, train_labels = read_fa(train_file, mode='train')
test_sample_id, test_seqs_info, _ = read_fa(test_file, mode='test')

train_data = {
    'sample_id': train_sample_id,
    'seq_info': train_seqs_info,
    'label': train_labels,
}

test_data = {
    'sample_id': test_sample_id,
    'seq_info': test_seqs_info,
}

# Sort the distinct labels before numbering them: iteration order of a set of
# strings can differ between interpreter runs (hash randomization), which
# would assign different class indices on every run.
label_map = {label: idx for idx, label in enumerate(sorted(set(train_labels)))}

# Inverse mapping, used to turn predicted class indices back into labels.
rev_label_map = {idx: label for label, idx in label_map.items()}

train = pd.DataFrame(data=train_data)
test = pd.DataFrame(data=test_data)

# Replace the string labels by their integer class index.
train['label'] = train['label'].map(label_map)

# Alphabet: every distinct symbol seen in the train and test sequences.
# Sorted so that the feature columns are created in the same order on every
# run (set iteration order of strings is not stable between runs).
alp = sorted(set(''.join(train_seqs_info + test_seqs_info)))

# Composition features, built identically for both frames:
#   seq_len  - sequence length
#   count_X  - number of occurrences of symbol X
#   freq_X   - count_X / seq_len
for frame in (train, test):
    frame['seq_len'] = frame['seq_info'].apply(len)
    for s in alp:
        # Bind the symbol as a default argument so the lambda does not depend
        # on the loop variable at call time.
        frame['count_' + s] = frame['seq_info'].apply(lambda seq, sym=s: seq.count(sym))
        # Reuse the count column instead of counting twice; an empty sequence
        # yields NaN here rather than raising ZeroDivisionError.
        frame['freq_' + s] = frame['count_' + s] / frame['seq_len']

# Every column except the identifiers, raw sequence and target is a feature.
feats = [col for col in train.columns if col not in ('label', 'sample_id', 'seq_info')]

x_train = train[feats]
y_train = train['label']
x_test = test[feats]


# LightGBM training configuration, passed unchanged to lgb.train.
params = dict(
    # Task definition.
    boosting_type='gbdt',
    objective='multiclass',
    num_class=245,  # number of target classes handed to LightGBM
    metric='multi_error',
    # Tree size and step size.
    num_leaves=300,
    min_data_in_leaf=500,
    learning_rate=0.007,
    max_depth=8,
    # Column and row subsampling.
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    # Regularisation.
    lambda_l1=0.4,
    lambda_l2=0.5,
    min_gain_to_split=0.2,
    # Logging and threading.
    verbose=-1,
    num_threads=2,
)

# 5-fold cross-validation (KFold is imported as an alias of StratifiedKFold).
folds = KFold(n_splits=5, shuffle=True, random_state=2021)

# Take the class count from params so it is defined in exactly one place.
n_classes = params['num_class']

# Out-of-fold class probabilities for the training rows, and the test-set
# probabilities averaged over the folds.
oof = np.zeros([len(x_train), n_classes])
predictions = np.zeros([len(x_test), n_classes])

# Upper bound on boosting rounds; early stopping may end training sooner.
num_round = 1000

# Newer LightGBM releases configure early stopping and periodic logging via
# callbacks (the `verbose_eval` / `early_stopping_rounds` arguments of
# lgb.train are no longer accepted in 4.x). Releases without
# lgb.log_evaluation still take the legacy keyword arguments.
use_callbacks = hasattr(lgb, 'log_evaluation')

for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train)):
    print("fold n°{}".format(fold_+1))
    trn_data = lgb.Dataset(x_train.iloc[trn_idx], y_train.iloc[trn_idx])
    val_data = lgb.Dataset(x_train.iloc[val_idx], y_train.iloc[val_idx])

    if use_callbacks:
        # Fresh callback objects per fold: they keep per-training state.
        stop_kwargs = {
            'callbacks': [
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(period=100),
            ],
        }
    else:
        stop_kwargs = {'verbose_eval': 100, 'early_stopping_rounds': 50}

    clf = lgb.train(params,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    **stop_kwargs)

    # x_train already contains only the feature columns, so no re-selection.
    oof[val_idx] = clf.predict(x_train.iloc[val_idx], num_iteration=clf.best_iteration)
    predictions += clf.predict(x_test, num_iteration=clf.best_iteration) / folds.n_splits

# Build the submission in its own frame instead of adding columns to x_test:
# x_test is a column selection of `test`, so assigning into it triggers
# pandas' SettingWithCopyWarning and pollutes the feature matrix.
best_class = predictions.argmax(axis=1).tolist()  # first maximum wins on ties
predicted_labels = [rev_label_map[idx] for idx in best_class]

submission = pd.DataFrame({
    'sample_id': test['sample_id'],
    # Labels were stored without the dot (first character + remainder);
    # re-insert it after the first character for the output format.
    'category_id': [code[0] + '.' + code[1:] for code in predicted_labels],
})
submission.to_csv('base_sub.csv', index=False)

# Hard out-of-fold predictions: the class with the highest probability per row.
y_pre = oof.argmax(axis=1)

# Micro-averaged scores on the out-of-fold predictions, printed in this order.
oof_report = (
    ("F1 score: {}", f1_score),
    ("Precision score: {}", precision_score),
    ("Recall score: {}", recall_score),
)
for template, scorer in oof_report:
    print(template.format(scorer(y_train, y_pre, average='micro')))

提交结果:目前排名第 14 名(共 27 支提交团队)

image-20210703193646587

主要是提取了氨基酸组成(AAC)特征,即一些简单的统计特征,没有考虑氨基酸之间的相对位置信息;在这样的特征上也没有必要调参,最后的预测结果也很拉胯。

下一步直接尝试 NLP 相关模型。

支持一下
  • 微信扫一扫
  • 支付宝扫一扫