微软恶意攻击预测
微软的一个数据集给出电脑的特征信息包括编号版本等给出是否受攻击的标签,
通过这些数据训练一个模型可以预测电脑是否会受到攻击。这是题目要求,
评判标准是ROC曲线下面积(AUC),输出类别概率。
记录我敲代码和思路过程和总结
首先针对数据集,第一步要做的是对数据进行分析,也就是特征工程,分析数据的
时候发现数据中有很大一部分标签量,型号量等,这些量必须经过编码转化,才
能让模型识别,数据中有很多空值我选择填充方法是用均值来填,(试了其他的
都表现一般吧)。
模型我选择了逻辑回归,回归有两个参数一个是训练算法一个是正则化系数,
训练算法我用的是sklearn的默认求解器(代码里没有另外指定),参数C取1,C是正则化强度的倒数(不是很理想的取值)
上代码
以下是我的代码,代码有点糙训练大概需要一个小时左右,我看了排名第一的
ROC值是69%,我的ROC是65%,有差距,
注释很详细
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits.

    For every column whose dtype is int16/int32/int64 or float16/float32/
    float64, the column's min and max are compared against the limits of
    each candidate dtype (smallest first) and the column is cast to the
    first one whose range contains both. Columns of any other dtype
    (int8, category, object, ...) are left untouched.

    The frame is modified in place and also returned.

    Args:
        df: pandas DataFrame to shrink.
        verbose: when true, print the final memory footprint and the
            percentage saved.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a value equal to the type's min/max still
            # fits (e.g. 127 is a valid int8).
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE: only the value *range* is checked. Casting to float16 or
            # float32 can still round individual values (float16 keeps about
            # three significant decimal digits).
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN (every comparison
                # above is then False): keep full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard the ratio so a zero-byte frame does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df
import numpy as np
import pandas as pd
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20 (successor:
# sklearn.model_selection) and preprocessing.Imputer in 0.22 (successor:
# sklearn.impute.SimpleImputer). These imports need scikit-learn < 0.20.
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
# Column name -> dtype string for the train/test CSV files. The mapping is
# passed to pandas.read_csv as `dtype=` and its keys are also the list of
# columns that get read. Columns typed 'category' are the ones the script
# label-encodes; every other entry is treated as numeric.
# NOTE(review): float16 represents integers exactly only up to 2048 and int8
# tops out at 127 -- verify that the *Identifier columns declared with these
# narrow types really fit, otherwise distinct ids would collapse onto the same
# value. TODO confirm against the data.
dtypes = {
'MachineIdentifier': 'category',
'ProductName': 'category',
'EngineVersion': 'category',
'AppVersion': 'category',
'AvSigVersion': 'category',
'IsBeta': 'int8',
'RtpStateBitfield': 'float16',
'IsSxsPassiveMode': 'int8',
'DefaultBrowsersIdentifier': 'float16',
'AVProductStatesIdentifier': 'float32',
'AVProductsInstalled': 'float16',
'AVProductsEnabled': 'float16',
'HasTpm': 'int8',
'CountryIdentifier': 'int16',
'CityIdentifier': 'float32',
'OrganizationIdentifier': 'float16',
'GeoNameIdentifier': 'float16',
'LocaleEnglishNameIdentifier': 'int8',
'Platform': 'category',
'Processor': 'category',
'OsVer': 'category',
'OsBuild': 'int16',
'OsSuite': 'int16',
'OsPlatformSubRelease': 'category',
'OsBuildLab': 'category',
'SkuEdition': 'category',
'IsProtected': 'float16',
'AutoSampleOptIn': 'int8',
'PuaMode': 'category',
'SMode': 'float16',
'IeVerIdentifier': 'float16',
'SmartScreen': 'category',
'Firewall': 'float16',
'UacLuaenable': 'float32',
'Census_MDC2FormFactor': 'category',
'Census_DeviceFamily': 'category',
'Census_OEMNameIdentifier': 'float16',
'Census_OEMModelIdentifier': 'float32',
'Census_ProcessorCoreCount': 'float16',
'Census_ProcessorManufacturerIdentifier': 'float16',
'Census_ProcessorModelIdentifier': 'float16',
'Census_ProcessorClass': 'category',
'Census_PrimaryDiskTotalCapacity': 'float32',
'Census_PrimaryDiskTypeName': 'category',
'Census_SystemVolumeTotalCapacity': 'float32',
'Census_HasOpticalDiskDrive': 'int8',
'Census_TotalPhysicalRAM': 'float32',
'Census_ChassisTypeName': 'category',
'Census_InternalPrimaryDiagonalDisplaySizeInInches': 'float16',
'Census_InternalPrimaryDisplayResolutionHorizontal': 'float16',
'Census_InternalPrimaryDisplayResolutionVertical': 'float16',
'Census_PowerPlatformRoleName': 'category',
'Census_InternalBatteryType': 'category',
'Census_InternalBatteryNumberOfCharges': 'float32',
'Census_OSVersion': 'category',
'Census_OSArchitecture': 'category',
'Census_OSBranch': 'category',
'Census_OSBuildNumber': 'int16',
'Census_OSBuildRevision': 'int32',
'Census_OSEdition': 'category',
'Census_OSSkuName': 'category',
'Census_OSInstallTypeName': 'category',
'Census_OSInstallLanguageIdentifier': 'float16',
'Census_OSUILocaleIdentifier': 'int16',
'Census_OSWUAutoUpdateOptionsName': 'category',
'Census_IsPortableOperatingSystem': 'int8',
'Census_GenuineStateName': 'category',
'Census_ActivationChannel': 'category',
'Census_IsFlightingInternal': 'float16',
'Census_IsFlightsDisabled': 'float16',
'Census_FlightRing': 'category',
'Census_ThresholdOptIn': 'float16',
'Census_FirmwareManufacturerIdentifier': 'float16',
'Census_FirmwareVersionIdentifier': 'float32',
'Census_IsSecureBootEnabled': 'int8',
'Census_IsWIMBootEnabled': 'float16',
'Census_IsVirtualDevice': 'float16',
'Census_IsTouchEnabled': 'int8',
'Census_IsPenCapable': 'int8',
'Census_IsAlwaysOnAlwaysConnectedCapable': 'float16',
'Wdft_IsGamer': 'float16',
'Wdft_RegionIdentifier': 'float16',
'HasDetections': 'int8'
}
# ---------------------------------------------------------------------------
# End-to-end pipeline: load the CSVs, label-encode categorical columns, impute
# and standardise features, fit a logistic regression, report hold-out metrics
# and write the submission file.
# ---------------------------------------------------------------------------

# --- configuration ---------------------------------------------------------
nrows = 4000000  # number of rows requested from train.csv
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_columns = [c for c, v in dtypes.items() if v in numerics]
categorical_columns = [c for c, v in dtypes.items() if v not in numerics]
print("设置变量参数")

# --- load data -------------------------------------------------------------
# `dtypes` already lists both 'MachineIdentifier' and 'HasDetections', so this
# is the full set of columns read from the training file.
retained_columns = numerical_columns + categorical_columns
train = pd.read_csv('train.csv',
                    nrows=nrows,
                    usecols=retained_columns,
                    dtype=dtypes)
print("读取训练数据完成")

# The label column is not requested from the test file.
retained_columns = [c for c in retained_columns if c != 'HasDetections']
test = pd.read_csv('test.csv',
                   usecols=retained_columns,
                   dtype=dtypes)
print("读取测试数据完成")

# --- label-encode categorical columns --------------------------------------
# The vocabulary of each column is learned from the training data only; any
# value get_indexer cannot find in that vocabulary is encoded as -1.
# 'MachineIdentifier' is a row id, not a feature, so it is skipped entirely.
indexer = {}
for col in categorical_columns:
    if col == 'MachineIdentifier':
        continue
    _, indexer[col] = pd.factorize(train[col])
    train[col] = indexer[col].get_indexer(train[col])
    test[col] = indexer[col].get_indexer(test[col])
print("编码转化完成")

train = reduce_mem_usage(train)
test = reduce_mem_usage(test)
print("降内存完成")

# --- separate label / ids from the features --------------------------------
target = train['HasDetections']
del train['HasDetections']
del train['MachineIdentifier']

idname = test['MachineIdentifier']
del test['MachineIdentifier']
# The transformers below work on positional arrays, so the test features must
# be in exactly the same column order as the training features.
if list(test.columns) != list(train.columns):
    test = test[train.columns]

# --- hold-out split, imputation, scaling -----------------------------------
# NOTE: `Imputer` is the pre-0.22 scikit-learn API imported by this file.
imp = Imputer(missing_values='NaN', strategy='mean')
X_train, X_test, Y_train, Y_test = train_test_split(
    train, target, test_size=0.2, random_state=0)
# Fit the imputer and the scaler on the training split ONLY and reuse the
# learned statistics everywhere else. Re-fitting on the hold-out or test data
# would feed the model features on a different scale than it was trained on.
X_train = imp.fit_transform(X_train)
X_test = imp.transform(X_test)
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)
print("数据分割完成")

# --- train -----------------------------------------------------------------
logreg = LogisticRegression(C=1)
logreg.fit(X_train_std, Y_train)
print("训练完成")

# --- evaluate on the hold-out split ----------------------------------------
val_proba = logreg.predict_proba(X_test_std)[:, 1]
acc = logreg.score(X_test_std, Y_test)
print("准确率:", str(acc))
# Accuracy alone does not reflect ranking quality; also report ROC AUC, which
# is computed from the predicted probabilities rather than hard labels.
auc = roc_auc_score(Y_test, val_proba)
print("ROC AUC:", str(auc))

# --- predict on the test set and write the submission ----------------------
test = imp.transform(test)
test_std = sc.transform(test)
print("测试数据处理完成")
prepro = logreg.predict_proba(test_std)
sub_df = pd.DataFrame({'MachineIdentifier': idname})
print("预测完成")
sub_df["HasDetections"] = prepro[:, 1]
sub_df.to_csv('Submission.csv', index=False)
print("写入完成")
print(123)
可以优化的地方
1数据上
##数据进行降维,特征的相关性检验, ##因为特征有很大一部分是标签类特征所以在编码转化的时候需要注意,探索编码转化的方式 ##归一化或者标准化等 ##或者只使用部分特征,选择部分特征来训练,从实际问题上对特征进行整合 ##数据分布情况
2模型上
##主要是模型的参数,先不考虑本模型的适不适用,模型主要的参数需要改善的有 ##使用哪种训练方法,c值的大小正则化, ##对数据增加权重,数据的分布情况