In [1]:
from typing import List, Optional, Dict, Union
import os
import random

import pandas as pd
import numpy as np
from pydantic import Field
from datatile.summary.df import DataFrameSummary
from sklearn.metrics import average_precision_score, f1_score, precision_score, recall_score, classification_report, roc_auc_score
from catboost import CatBoostClassifier
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer

import tldextract

%load_ext autotime

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


time: 174 µs (started: 2022-09-26 14:40:06 -07:00)


# Initial Research
## Docs and Existed Solutions
1. [Domain generation algorithm / Detection](https://en.wikipedia.org/wiki/Domain_generation_algorithm#Detection)
2. ["Character Level based Detection of DGA Domain Names"](http://faculty.washington.edu/mdecock/papers/byu2018a.pdf) - a perfect review to step into this problem.
3. GAN perspective:
 - ["DeepDGA: Adversarially-Tuned Domain Generation and Detection"](https://arxiv.org/pdf/1610.01969.pdf) It has a [code](https://github.com/roreagan/DeepDGA).
 - ["MaskDGA: A Black-box Evasion Technique Against DGA Classifiers and Adversarial Defenses"](https://arxiv.org/pdf/1902.08909.pdf)
4. ["Inline Detection of Domain Generation Algorithms with Context-Sensitive Word Embeddings"](https://arxiv.org/pdf/1811.08705.pdf) - non-typical approach with splitting DN to words and applying a pretrained ELMO model.
5. https://github.com/jayjacobs/dga On R :(
6. https://github.com/andrewaeva/DGA 5 years old :( Has a big DGA data (18 MB)
7. https://github.com/nickwallen/botnet-dga-classifier On R. 6 years old :(
8. https://github.com/Daniellee1990/Language-Model-based-Detection-Approach-of-Algorithmically-Generated-Malicious-Domain-Names old methods :(


# Initial Approach

I'm selecting one of the existing solutions for the start, based on these criteria: 
- it should be "good enough" and "near state-of-art" (up-to-date)
- it should be fast to start, not complex for implementation

A good candidate is the 4th solution based on a pretrained model.

## Pro & Cons
### Pro
- It is simple enough to create a baseline solution and get the first result.
- There are many points of improvement including many pretrained models.
- Fast training, which allows the precise hyperparameter tuning.

### Cons
- No source code to start right away (but there is no good code base for other candidates either)


## Open Questions with current solutions and ways to experiments
1. `word2vec` vs. `ELMO` and `BERT`. Is there any "long text' correlation between "words" inside DNS domain names (DN)? If no such correlation, then we don't need the long text context hence word2vec would work just fine.
1. Splitting DN to words. Does it help? Verify this hypothesis. 
1. Word Embedding. The BPE encoding can be better. Verify with the FB FastText model (and BERT model).
1. Best Pretrained Model. The BERT or custom ELMO (flair pretrained) can be better. Verify with the pretrained ELMO, BERT and, maybe, train a custom ELMO model.
1. More complex trained classification layers. The paper didn't mention any hyperparameter tuning. If the proposed model trained fast, it makes sense to play with hyperparameters: several layers-units; normalization; dropout; batch size; learning rate progress; etc.
1. Classification objective. Do we need a multilabel classification while business requires only the binary classification? Does multilabel classification improve binary results?
1. Symmetrical Loss function. Are FPs and TPs important at the same rate? 
1. TLD. Is TLD helpful? 

# Data Preparations
* download correct DNs:
 1. http://s3.amazonaws.com/alexa-static/top-1m.csv.zip
* download malignant DNs (clone these projects):
 1. https://github.com/baderj/domain_generation_algorithms
 1. https://github.com/liorsidi/Adversarial-DGA-Datasets
 
* download pretrained models (see code below): 
 1. word embedding, defined by the pretrained model. GloVe from [spaCy](https://spacy.io/models/en) : md model: 685k keys, 20k unique vectors (300 dimensions) or lg model: 685k keys, 685k unique vectors (300 dimensions); original GloVe. We will create an input data of fixed size (in number of words).
 2. BPE embedding: [flair](https://github.com/flairNLP/flair), [FastText](https://fasttext.cc/docs/en/python-module.html#text-classification-model); code examples: https://github.com/facebookresearch/fastText/tree/master/python/doc/examples, https://fasttext.cc/docs/en/english-vectors.html; models: https://fasttext.cc/docs/en/english-vectors.html. 
- Input Data: we convert the combined benign and malignant DNs into embedding from pretrained models
- Output Data: DGA code (multiclass probability) or Good/Bad (binary).

## Input raw data
Download http://s3.amazonaws.com/alexa-static/top-1m.csv.zip - 500K benign DNs.

Clone https://github.com/baderj/domain_generation_algorithms project with malignant DNs.

### benign

In [90]:
def read_data(file_name):
 ret = open(file_name, encoding='utf-8').read().splitlines()
 print(f'Load {len(ret):,} from file "{file_name}"')
 return ret
 
def remove_TLD(dns):
 return [el.split('.')[0] for el in dns]

file_name = 'data/Alex_top-0.5M.csv'
good_dn = read_data(file_name)

print(len(good_dn), good_dn[-5:])
good_dn = [el.split(',')[1] for el in good_dn]
print(len(good_dn), good_dn[-5:])

good_dn = remove_TLD(good_dn)
print(len(good_dn), good_dn[-5:])

good_dn = [{'dn': el, 'label': 'good'} for el in good_dn]
print(len(good_dn), good_dn[-5:])


Load 525,215 from file "data/Alex_top-0.5M.csv"
525215 ['525211,webtrics.ch', '525212,weshfee.com', '525213,yosofunny.com', '525214,yougogirlz.com', '525215,zehabesha.com']
525215 ['webtrics.ch', 'weshfee.com', 'yosofunny.com', 'yougogirlz.com', 'zehabesha.com']
525215 ['webtrics', 'weshfee', 'yosofunny', 'yougogirlz', 'zehabesha']
525215 [{'dn': 'webtrics', 'label': 'good'}, {'dn': 'weshfee', 'label': 'good'}, {'dn': 'yosofunny', 'label': 'good'}, {'dn': 'yougogirlz', 'label': 'good'}, {'dn': 'zehabesha', 'label': 'good'}]


### malignant

In [92]:
# clone https://github.com/baderj/domain_generation_algorithms project then:

dr = 'domain_generation_algorithms/*/example_domains.txt'

import glob

# root_dir needs a trailing slash (i.e. /root/dir/)
files = glob.iglob(dr, recursive=True)
len(list(files))

31

In [93]:
# files = ['data/banjori.txt', 'data/bazarbackdoor.txt', 'data/chinad.txt', 'data/corebot.txt']
bad_dn = []
for f in files:
 label = f.split('/')[1].split('.')[0]
 bad_dn += [{'dn': el, 'label': label} for el in remove_TLD(read_data(f))]
 
 
print(len(bad_dn), bad_dn[-5:])


0 []


In [29]:
def gen_domain(max):
 for domain in range(max):
 yield str(domain)
 
import pandas as pd

def write_file(mx):
 df = pd.DataFrame(data={"domain": gen_domain(mx)})
 df = df.drop_duplicates()
 df.to_csv("domains.csv", index=False)
 print("Saved in domains.csv")

mx = 100
write_file(mx) 
print("Saved in domains.csv")

Saved in domains.csv
Saved in domains.csv


In [21]:
import glob
import pandas as pd


def input_malignant(output_file, data_dir, nrows=None):
 domain_field = 'domain'
 pattern = f"{data_dir}/*/example_domains.txt"
 files = glob.iglob(pattern, recursive=True)
 print(f"{pattern=}")
 print(f"{files=}")
 out_df = pd.DataFrame()
 for file in files:
 file_name = file
 df = pd.read_csv(file_name, header=None, nrows=nrows, usecols=[0]).rename(columns={0: domain_field}) 
 print(f"Loaded {file}, {df.shape[0]:,}. Example: {df.loc[0, domain_field]}")
 df = df.drop_duplicates()
 print(f" Deduplicated: {df.shape[0]:,}") 
 
 # df['subdomain'] = df[domain_field].apply(extract_subdomain)
 # df = df.dropna(subset=['subdomain'])
 # df = df.drop_duplicates(subset=['subdomain'])
 # print(f" Deduplicated by subdomains: {df.shape[0]:,}") 
 
 out_df = pd.concat([out_df, df])
 print(f" Concatenated {df.shape[0]:,} -> {out_df.shape[0]:,}")
 
 print(f"Result {out_df.shape[0]:,}") 
 out_df = out_df.drop_duplicates(subset=[domain_field])
 print(f"Depuplicated by domains {out_df.shape[0]:,}") 
 output_file = f"{data_dir}/{output_file[:-4]}.{out_df.shape[0]//1_000_000}M.csv"
 out_df.to_csv(output_file, index=False)
 print(f"Saved {out_df.shape[0]:,} into {output_file}") 
 return out_df


data_dir = "domain_generation_algorithms" # /external
output_file = "malignant_domains.external.all.csv"
nrows = None
df = input_malignant(output_file, data_dir=data_dir, nrows=nrows)

pattern='domain_generation_algorithms/*/example_domains.txt'
files=<generator object _iglob at 0x7f163b1e4ac0>
Loaded domain_generation_algorithms/unnamed_downloader/example_domains.txt, 120. Example: ddknt.github.io
 Deduplicated: 60
 Concatenated 60 -> 60
Loaded domain_generation_algorithms/pitou/example_domains.txt, 306. Example: --------------+
 Deduplicated: 283
 Concatenated 283 -> 343
Loaded domain_generation_algorithms/chinad/example_domains.txt, 256. Example: 8f6bacmw30xxv6sc.cn
 Deduplicated: 256
 Concatenated 256 -> 599
Loaded domain_generation_algorithms/qadars/example_domains.txt, 200. Example: jk9enwhansl2.org
 Deduplicated: 200
 Concatenated 200 -> 799
Loaded domain_generation_algorithms/mydoom/example_domains.txt, 99. Example: qehspqnmrn.info
 Deduplicated: 99
 Concatenated 99 -> 898
Loaded domain_generation_algorithms/monerodownloader/example_domains.txt, 2,500. Example: 31b4bd31fg1x2.org
 Deduplicated: 2,500
 Concatenated 2,500 -> 3,398
Loaded domain_generation_algori

In [34]:
from glob import glob
[f.replace("Adversarial-DGA-Datasets/", "")[:-1] for f in glob("Adversarial-DGA-Datasets/*/", recursive = True)]

['RandomAttack',
 'SearchAttack',
 'MaskDGA',
 'AppendAttack',
 'unigrams_distribution',
 'CharBot',
 'DeepDGA']

## splitting DN to words (Optional)
Calculate the max word of the test data, we can use these max for defining the fixed size of the input windows, if we use the word embeddings

In [32]:
import wordninja

bad_len_max, good_len_max = max([len(wordninja.split(el['dn'])) for el in bad_dn]), max([len(wordninja.split(el['dn'])) for el in good_dn])
bad_len_max, good_len_max # (19, 33)

# we can use these max for defining the fixed size of the input windows, if we use the word embeddings.

(19, 33)

## SpaCy tokenizer, Embeddings

### GloVe

In [26]:
import spacy
from spacy.tokenizer import Tokenizer


class Model_vectors():
 def __init__(self, model_size='M'):
 models = {'S': 'en_core_web_sm', 'M': 'en_core_web_md', 'L': 'en_core_web_lg', 'XL': 'en_vectors_web_lg'}
 self.model = model_size
 self.nlp = spacy.load(models[self.model], disable=["tagger", "parser", 'ner'])
 self._vocab = self.nlp.vocab
 self._tokenizer = Tokenizer(self.nlp.vocab)
 self.get_vecs = self._get_spacy_vecs
 self.get_vec = self._get_spacy_vec
 print(f'Initialized "{models[self.model]}" model.')

 def is_oov(self, word):
 return False if word in self._vocab else True

 def remove_oov(self, text):
 """
 It takes text and remove (or replace) words that are out-of-vocabulary.
 Note: it splits text only with spaces.
 :param text: any text
 :return: text without oov words.
 """
 return (' '.join(str(w) for w in text.split(' ') if not self.is_oov(str(w)))).strip()

 def _get_spacy_vecs(self, texts):
 return [(doc.text, doc.vector) for doc in self._tokenizer.pipe(texts, batch_size=1000)]

 def _get_spacy_vec(self, text):
 return text, self._tokenizer(text).vector

 def doc_without_stop_words(self, text):
 return self.nlp(self.text_without_stop_words(text))

 def text_without_stop_words(self, text):
 return ' '.join([str(t).lower() for t in self.nlp(text) if not t.is_stop and not t.is_punct and not t.is_oov])


In [27]:
model_vectors = Model_vectors('M')

Initialized "en_core_web_md" model.


In [28]:
model_vectors.get_vecs(['text', 'title'])

[('text', array([ 0.037103 , -0.31259 , -0.17857 , 0.30001 , 0.078154 ,
 0.17958 , 0.12048 , -0.11879 , -0.20601 , 1.2849 ,
 -0.20409 , 0.80613 , 0.34344 , -0.19191 , -0.084511 ,
 0.17339 , 0.042483 , 2.0282 , -0.16278 , -0.60306 ,
 -0.53766 , 0.35711 , 0.22882 , 0.1171 , 0.42983 ,
 0.16165 , 0.407 , 0.036476 , 0.52636 , -0.13524 ,
 -0.016897 , 0.029259 , -0.079115 , -0.32305 , 0.052255 ,
 -0.3617 , -0.18355 , -0.34717 , -0.3691 , 0.16881 ,
 0.21018 , -0.38376 , -0.096909 , -0.36296 , -0.37319 ,
 0.0021152, 0.32512 , 0.063977 , 0.36249 , -0.26935 ,
 -0.59341 , -0.13625 , 0.016425 , -0.2474 , -0.07498 ,
 0.034708 , -0.01476 , -0.11648 , 0.25559 , -0.35002 ,
 -0.52707 , 0.21221 , 0.062456 , 0.26184 , 0.53149 ,
 0.34957 , -0.22692 , 0.44076 , 0.4438 , 0.6335 ,
 -0.049757 , -0.08134 , 0.65618 , -0.4716 , 0.090675 ,
 -0.084873 , 0.31455 , -0.38495 , -0.19247 , 0.48064 ,
 0.26688 , 0.095743 , 0.13024 , 0.37023 , 0.46269 ,
 -0.32844 , 0.17375 , -0.36325 , 0.30672 , -0.075042 ,
 -0.64684 , -0.

### ELMO embedding

In [41]:
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, CharacterEmbeddings
from flair.embeddings import BertEmbeddings, ELMoEmbeddings, CharLMEmbeddings

In [34]:
glove_emb = WordEmbeddings('glove')

I0915 17:48:32.024530 21016 utils.py:422] loading Word2VecKeyedVectors object from C:\Users\leo_g\.flair\embeddings\glove.gensim
I0915 17:48:33.275043 21016 utils.py:461] loading vectors from C:\Users\leo_g\.flair\embeddings\glove.gensim.vectors.npy with mmap=None
I0915 17:48:33.492771 21016 utils.py:494] setting ignored attribute vectors_norm to None
I0915 17:48:33.493755 21016 utils.py:428] loaded C:\Users\leo_g\.flair\embeddings\glove.gensim


In [36]:
sent = Sentence('This is a system integration.')
glove_emb.embed(sent)
_ = [print(t, t.embedding.shape) for t in sent]

Token: 1 This torch.Size([100])
Token: 2 is torch.Size([100])
Token: 3 a torch.Size([100])
Token: 4 system torch.Size([100])
Token: 5 integration. torch.Size([100])


In [38]:
sent = Sentence(' '.join(wordninja.split('igjyestnessbiophysicalohax')))
glove_emb.embed(sent)
_ = [print(t, t.embedding.shape) for t in sent]

Token: 1 ig torch.Size([100])
Token: 2 j torch.Size([100])
Token: 3 yest torch.Size([100])
Token: 4 ness torch.Size([100])
Token: 5 bio torch.Size([100])
Token: 6 physical torch.Size([100])
Token: 7 oha torch.Size([100])
Token: 8 x torch.Size([100])


In [46]:
sent = Sentence(' '.join(wordninja.split('igjyestnessbiophysicalohax')))
glove_emb.embed(sent)
sent[0].embedding.shape, sent[0].embedding

(torch.Size([100]),
 tensor([ 0.3876, 0.1173, -0.1786, -0.5518, -1.3703, -1.7363, -0.4303, 0.2215,
 -0.1897, -0.2305, -0.2904, -0.9843, -0.6379, 0.8521, -0.4181, 0.7708,
 0.1105, -0.6815, -0.1501, 0.0790, 1.1104, -0.1524, -0.0356, -0.5496,
 0.6761, -1.1164, 0.3294, 0.3358, -0.1761, 0.3174, 0.2326, -0.1749,
 -0.1531, -0.2616, -0.1671, -0.1590, 0.9442, 0.3235, -0.1124, -0.2102,
 -0.2055, 0.2575, -0.3094, 0.0053, -0.3699, 0.0685, 0.1430, -0.0455,
 -0.2702, 0.4782, 0.3559, -0.3921, -0.2094, 0.0851, -1.3836, 0.4357,
 -0.4373, 0.6922, -0.2560, 0.4856, 0.7953, -0.4402, -0.6072, 0.8645,
 0.6602, -0.1795, -0.2840, 0.6075, 1.1740, 0.0348, 0.0285, 0.1810,
 -0.8015, -1.0579, -0.3046, -0.5177, -0.3481, -1.2305, -0.9520, 0.0256,
 0.6460, 0.2348, -0.8135, -0.4275, -0.2730, -0.3802, 0.5882, 0.0488,
 0.7306, -0.4720, 0.7308, -0.0583, 0.1116, 0.0297, 1.1622, 0.2942,
 1.0848, 0.7977, 0.2797, 0.7269], device='cuda:0'))

In [43]:
flair_emb = FlairEmbeddings('d:/Program Files/flair/embeddings/lm-news-english-forward-v0.2rc.pt')

In [45]:
sent = Sentence(' '.join(wordninja.split('igjyestnessbiophysicalohax')))
flair_emb.embed(sent)
sent[0].embedding.shape, sent[0].embedding

(torch.Size([2048]),
 tensor([ 1.4106e-05, 3.7738e-02, -5.4646e-02, ..., -1.2723e-04,
 1.2393e-01, -2.3951e-03], device='cuda:0'))

In [48]:
bert_base_emb = BertEmbeddings('bert-base-uncased')


I0915 19:33:33.334479 21016 tokenization_utils.py:380] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\leo_g\.cache\torch\transformers\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
I0915 19:33:33.884137 21016 configuration_utils.py:157] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at C:\Users\leo_g\.cache\torch\transformers\4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
I0915 19:33:33.885138 21016 configuration_utils.py:174] Model config {
 "architectures": [
 "BertForMaskedLM"
 ],
 "attention_probs_dropout_prob": 0.1,
 "finetuning_task": null,
 "hidden_act": "gelu",
 "hidden_dropout_prob": 0.1,
 "hidden_size": 768,
 "initializer_range": 0.02,
 "intermediate_size": 3072,
 "is_decod

In [49]:
sent = Sentence(' '.join(wordninja.split('igjyestnessbiophysicalohax')))
bert_base_emb.embed(sent)
sent[0].embedding.shape, sent[0].embedding

(torch.Size([3072]),
 tensor([-0.4745, 0.2342, 0.8207, ..., -0.2806, 0.5300, 0.3944],
 device='cuda:0'))

# flair models

## classification example

In [5]:
from flair.data import Corpus
from flair.datasets import TREC_6
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

In [None]:
# # 1. get the corpus
# corpus: Corpus = TREC_6()

In [5]:
ds = corpus.test
dir(ds)

['__add__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_parse_line_to_sentence',
 'in_memory',
 'is_in_memory',
 'label_prefix',
 'max_chars_per_doc',
 'max_tokens_per_doc',
 'path_to_file',
 'sentences',
 'tokenizer',
 'total_sentence_count']

In [6]:
# 2. create the label dictionary
label_dict = corpus.make_label_dictionary()

2020-09-17 08:35:48,357 Computing label dictionary. Progress:


100%|██████████████████████████████████████████████████████████████████████████| 4907/4907 [00:00<00:00, 213334.54it/s]

2020-09-17 08:35:48,394 [b'ENTY', b'DESC', b'NUM', b'ABBR', b'LOC', b'HUM']





In [7]:
# len(label_dict), type(label_dict), dir(label_dict)
from torch.optim.adam import Adam

from flair.data import Corpus
from flair.datasets import TREC_6
from flair.embeddings import TransformerDocumentEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer


AttributeError: module 'flair' has no attribute 'nn'

In [9]:
# 3. make a list of word embeddings
word_embeddings = [WordEmbeddings('glove')]

I0917 08:37:05.399397 11616 utils.py:422] loading Word2VecKeyedVectors object from C:\Users\leo_g\.flair\embeddings\glove.gensim
I0917 08:37:06.428536 11616 utils.py:461] loading vectors from C:\Users\leo_g\.flair\embeddings\glove.gensim.vectors.npy with mmap=None
I0917 08:37:06.633977 11616 utils.py:494] setting ignored attribute vectors_norm to None
I0917 08:37:06.634947 11616 utils.py:428] loaded C:\Users\leo_g\.flair\embeddings\glove.gensim


In [15]:
# len(word_embeddings), type(word_embeddings), dir(word_embeddings)

In [11]:
# 4. initialize document embedding by passing list of word embeddings
# Can choose between many RNN types (GRU by default, to change use rnn_type parameter)
document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=256)

In [14]:
# type(document_embeddings), dir(document_embeddings)

In [17]:
# 5. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

In [18]:
# 6. initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)

In [19]:
# 7. start the training
trainer.train('resources/taggers/trec',
 learning_rate=0.1,
 mini_batch_size=32,
 anneal_factor=0.5,
 patience=5,
 max_epochs=150)

2020-09-17 08:40:57,074 ----------------------------------------------------------------------------------------------------
2020-09-17 08:40:57,076 Model: "TextClassifier(
 (document_embeddings): DocumentRNNEmbeddings(
 (embeddings): StackedEmbeddings(
 (list_embedding_0): WordEmbeddings('glove')
 )
 (word_reprojection_map): Linear(in_features=100, out_features=100, bias=True)
 (rnn): GRU(100, 256, batch_first=True)
 (dropout): Dropout(p=0.5, inplace=False)
 )
 (decoder): Linear(in_features=256, out_features=6, bias=True)
 (loss_function): CrossEntropyLoss()
)"
2020-09-17 08:40:57,077 ----------------------------------------------------------------------------------------------------
2020-09-17 08:40:57,094 Corpus: "Corpus: 4907 train + 545 dev + 500 test sentences"
2020-09-17 08:40:57,096 ----------------------------------------------------------------------------------------------------
2020-09-17 08:40:57,097 Parameters:
2020-09-17 08:40:57,098 - learning_rate: "0.1"
2020-09-17 08:

{'test_score': 0.916,
 'dev_score_history': [0.411,
 0.3743,
 0.4294,
 0.4147,
 0.5817,
 0.5358,
 0.6422,
 0.6349,
 0.7193,
 0.7046,
 0.6312,
 0.7394,
 0.7394,
 0.7358,
 0.7284,
 0.7706,
 0.7963,
 0.8,
 0.7725,
 0.778,
 0.7945,
 0.789,
 0.7523,
 0.8312,
 0.8257,
 0.8404,
 0.8367,
 0.8294,
 0.8092,
 0.8202,
 0.844,
 0.8385,
 0.822,
 0.8294,
 0.8587,
 0.8606,
 0.8385,
 0.8202,
 0.8606,
 0.8532,
 0.8697,
 0.8422,
 0.8569,
 0.855,
 0.8679,
 0.8734,
 0.8569,
 0.8606,
 0.8881,
 0.8716,
 0.8477,
 0.8771,
 0.8679,
 0.8789,
 0.8789,
 0.8789,
 0.8752,
 0.8807,
 0.8936,
 0.9009,
 0.8789,
 0.8807,
 0.8624,
 0.8899,
 0.8716,
 0.8862,
 0.8936,
 0.8826,
 0.8862,
 0.8899,
 0.8881,
 0.8899,
 0.8936,
 0.8972,
 0.8991,
 0.8881,
 0.8881,
 0.8972,
 0.8972,
 0.8954,
 0.8954,
 0.8991,
 0.8936,
 0.8954,
 0.8954,
 0.8991,
 0.8936,
 0.8954,
 0.8899,
 0.8917,
 0.8936,
 0.8936,
 0.8936,
 0.8936,
 0.8954,
 0.8954,
 0.8936,
 0.8917,
 0.8954,
 0.8954,
 0.8954,
 0.8954,
 0.8954,
 0.8954,
 0.8954,
 0.8954,
 0.8954,
 0

In [23]:
classifier = TextClassifier.load('resources/taggers/trec/final-model.pt')

# create example sentence
from flair.data import Sentence

sentence = Sentence('Who built the Eiffel Tower ?')

# predict class and print
classifier.predict(sentence)

print(sentence.labels)

2020-09-17 09:08:26,787 loading file resources/taggers/trec/final-model.pt
[HUM (0.9996542930603027)]


## prepare data files

Malignant data spread between several sets with different size for different DGA algorithms. We need stratification for these algorithms between train, dev, and test samples. So we split each DGA set into train, dev, and test and ONLY after that compound samples of all DGA algos into the result train, dev, and test sample sets.

DNs split by pseudo-words by wordninja package. Using pseudo-words inplace of DN we can use the models with word embeddings.
We prepare two data sets: one with samples split by pseudo-words and the second without such spliting. Then we experiment with both data sets.

Samples compound of a lablel and a sample text (DN in form of pseudo-words), where label is in the FastText format (as "__label__<label> <sample text>").

In [3]:
import wordninja

def split(txt):
 return ' . '.join(' '.join(wordninja.split(t)) for t in txt.split('.'))

split('myshopify.biz.com')

'my shop if y . biz . com'

In [4]:
# benign data

def read_data(file_name):
 ret = open(file_name, encoding='utf-8').read().splitlines()
 print(f'Load {len(ret):,} from file "{file_name}"')
 return ret
 
def remove_TLD(dns):
 return [el.split('.')[0] for el in dns]


def add_label(in_file, out_file, label, split_by_words=True):
 lines = read_data(in_file)
 if split_by_words: lines = [split(el) for el in lines]
 lines = [f'__label__{label} {el}\n' for el in lines]
 with open(out_file, 'w') as f:
 f.writelines(lines)
 print(f'Saved {len(lines):,} into "{out_file}"')

in_file = 'data/Alex_top-0.5M.txt'
label = 'binign'
split_by_words=True
out_file = f'{in_file[:-4]}.{label}.{"split_by_words." if split_by_words else ""}txt'
add_label(in_file, out_file, 'binign', split_by_words=split_by_words)


In [101]:
# malitious data
#

def read_data(file_name, start, end):
 ret = open(file_name, encoding='utf-8').read().splitlines()
 print(f' Load {len(ret):,} from file "{file_name}"')
 st, en = int(start*len(ret)), int(end*len(ret))
 print(f' Get: [{st:,}:{en:,}]')
 return ret[st:en]

def read(f, start, end):
 ret = []
 if type(f) == list:
 for ff in f:
 ret += read_data(ff, start=start, end=end)
 elif type(f) == str:
 ret = read_data(f, start=start, end=end)
 return ret
 
def add_label(in_file, out_file, label, split_by_words=True, start=0, end=0.8):
 lines = read(in_file, start=start, end=end)
 if split_by_words: lines = [split(el) for el in lines]
 lines = [f'__label__{label} {el}\n' for el in lines]
 with open(out_file, 'w') as f:
 f.writelines(lines)
 print(f'Saved {len(lines):,} into "{out_file}"')

dr = 'domain_generation_algorithms/*/example_domains.txt'

import glob

in_file = list(glob.iglob(dr, recursive=True))

print('malignant files:', len(in_file), )

for split_by_words in [True, False]:
 for label, start, end in [('train', 0, 0.8), ('dev', 0.8, 0.9), ('test', 0.9, 1)]:
 out_file = f'data/malignant.{"split_by_words." if split_by_words else ""}{label}.txt'
 add_label(in_file, out_file, label, split_by_words=split_by_words, start=start, end=end)

# Saved 18,119 into "data/malignant.train.txt"

# Saved 2,265 into "data/malignant.dev.txt"

# Saved 2,273 into "data/malignant.test.txt"

malignant files: 31
 Load 1,000 from file "domain_generation_algorithms\banjori\example_domains.txt"
 Get: [0:800]
 Load 2,160 from file "domain_generation_algorithms\bazarbackdoor\example_domains.txt"
 Get: [0:1,728]
 Load 256 from file "domain_generation_algorithms\chinad\example_domains.txt"
 Get: [0:204]
 Load 40 from file "domain_generation_algorithms\corebot\example_domains.txt"
 Get: [0:32]
 Load 30 from file "domain_generation_algorithms\dircrypt\example_domains.txt"
 Get: [0:24]
 Load 5 from file "domain_generation_algorithms\dnschanger\example_domains.txt"
 Get: [0:4]
 Load 12 from file "domain_generation_algorithms\gozi\example_domains.txt"
 Get: [0:9]
 Load 8 from file "domain_generation_algorithms\locky\example_domains.txt"
 Get: [0:6]
 Load 2,500 from file "domain_generation_algorithms\monerodownloader\example_domains.txt"
 Get: [0:2,000]
 Load 99 from file "domain_generation_algorithms\mydoom\example_domains.txt"
 Get: [0:79]
 Load 2,048 from file "domain_generation_algo

In [68]:
in_file = 'data/Alex_top-0.5M.txt'
label = 'benign'
split_by_words=True
out_file = f'{in_file[:-4]}.{label}.{"split_by_words." if split_by_words else ""}train.txt'
add_label(in_file, out_file, label, split_by_words=split_by_words, start=0, end=0.8)

Load 525,215 from file "data/Alex_top-0.5M.txt"
 Get: [0:420,172]
Saved 420,172 into "data/Alex_top-0.5M.binign.split_by_words.train.txt"


In [71]:
# compound
def compound_files(in_files, out_file):
 with open(out_file, 'w') as outfile:
 for fname in in_files:
 with open(fname) as infile:
 outfile.write(infile.read())
 print(f'Saved "{out_file}"')
 
vars = ['dev', 'test', 'train', 'split_by_words.dev', 'split_by_words.test', 'split_by_words.train']
_ = [compound_files([f'data/{el}.{var}.txt' for el in ['Alex_top-0.5M.benign', 'malignant']], f'data/data.{var}.txt') for var in vars]


Saved "data/data.dev.txt"
Saved "data/data.test.txt"
Saved "data/data.train.txt"
Saved "data/data.split_by_words.dev.txt"
Saved "data/data.split_by_words.test.txt"
Saved "data/data.split_by_words.train.txt"


## Prepare Corpuses

In [102]:
from flair.data import Corpus
from flair.datasets import ClassificationCorpus

corpus_no_split: Corpus = ClassificationCorpus('data',
 test_file='data.test.txt',
 dev_file='data.dev.txt',
 train_file='data.train.txt')

2020-09-17 14:05:55,290 Reading data from data
2020-09-17 14:05:55,292 Train: data\data.train.txt
2020-09-17 14:05:55,293 Dev: data\data.dev.txt
2020-09-17 14:05:55,295 Test: data\data.test.txt


In [1]:
from flair.data import Corpus
from flair.datasets import ClassificationCorpus

corpus_word_split: Corpus = ClassificationCorpus('data',
 test_file='data.split_by_words.test.txt',
 dev_file='data.split_by_words.dev.txt',
 train_file='data.split_by_words.train.txt')


2020-09-17 21:23:21,636 Reading data from data
2020-09-17 21:23:21,639 Train: data\data.split_by_words.train.txt
2020-09-17 21:23:21,640 Dev: data\data.split_by_words.dev.txt
2020-09-17 21:23:21,642 Test: data\data.split_by_words.test.txt


## Train models

In [2]:
from flair.data import Corpus
from flair.datasets import TREC_6
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

### GloVe / no_split

In [103]:
# 2. create the label dictionary
label_dict = corpus_no_split.make_label_dictionary()

2020-09-17 14:44:26,883 Computing label dictionary. Progress:


100%|████████████████████████████████████████████████████████████████████████| 422936/422936 [02:02<00:00, 3460.80it/s]

2020-09-17 14:47:32,717 [b'benign', b'malignant']





In [77]:
# len(label_dict), type(label_dict), dir(label_dict)

In [104]:
word_embeddings = [WordEmbeddings('glove')]

document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=256)
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)
trainer = ModelTrainer(classifier, corpus_no_split)

I0917 14:47:44.510528 11616 utils.py:422] loading Word2VecKeyedVectors object from C:\Users\leo_g\.flair\embeddings\glove.gensim
I0917 14:47:48.508447 11616 utils.py:461] loading vectors from C:\Users\leo_g\.flair\embeddings\glove.gensim.vectors.npy with mmap=None
I0917 14:47:48.596435 11616 utils.py:494] setting ignored attribute vectors_norm to None
I0917 14:47:48.598442 11616 utils.py:428] loaded C:\Users\leo_g\.flair\embeddings\glove.gensim


In [105]:
trainer.train('models/glove.no_split',
 learning_rate=0.1,
 mini_batch_size=256, # 32,
 anneal_factor=0.5,
 patience=3,
 max_epochs=30)

# 2020-09-17 15:23:05,197 loading file models\glove.no_split\best-model.pt
# 2020-09-17 15:24:17,229 0.9935	0.9935	0.9935
# 2020-09-17 15:24:17,230 
# MICRO_AVG: acc 0.987 - f1-score 0.9935
# MACRO_AVG: acc 0.4968 - f1-score 0.49835
# benign tp: 52522 - fp: 346 - fn: 0 - tn: 0 - precision: 0.9935 - recall: 1.0000 - accuracy: 0.9935 - f1-score: 0.9967
# malignant tp: 0 - fp: 0 - fn: 346 - tn: 52522 - precision: 0.0000 - recall: 0.0000 - accuracy: 0.0000 - f1-score: 0.0000


2020-09-17 14:48:48,172 ----------------------------------------------------------------------------------------------------
2020-09-17 14:48:48,174 Model: "TextClassifier(
 (document_embeddings): DocumentRNNEmbeddings(
 (embeddings): StackedEmbeddings(
 (list_embedding_0): WordEmbeddings('glove')
 )
 (word_reprojection_map): Linear(in_features=100, out_features=100, bias=True)
 (rnn): GRU(100, 256, batch_first=True)
 (dropout): Dropout(p=0.5, inplace=False)
 )
 (decoder): Linear(in_features=256, out_features=2, bias=True)
 (loss_function): CrossEntropyLoss()
)"
2020-09-17 14:48:48,177 ----------------------------------------------------------------------------------------------------
2020-09-17 14:48:48,179 Corpus: "Corpus: 422936 train + 52867 dev + 52868 test sentences"
2020-09-17 14:48:48,181 ----------------------------------------------------------------------------------------------------
2020-09-17 14:48:48,183 Parameters:
2020-09-17 14:48:48,185 - learning_rate: "0.1"
2020-09-

{'test_score': 0.9935,
 'dev_score_history': [0.9935, 0.9935, 0.9935, 0.9935],
 'train_loss_history': [0.04442076490379541,
 0.03973632867696497,
 0.039671257708426756,
 0.0396350922786416],
 'dev_loss_history': [tensor(0.0511, device='cuda:0'),
 tensor(0.0512, device='cuda:0'),
 tensor(0.0512, device='cuda:0'),
 tensor(0.0513, device='cuda:0')]}

### GloVe / word_split

In [106]:
model_name = 'glove'
model_var = 'word_split'
corpus = corpus_word_split

import os

model_dir = f'models/{model_name}.{model_var}'
try:
 os.mkdir(model_dir)
except OSError:
 print (f"Creation of the directory '{model_dir}' failed.")
else:
 print (f"Created the directory '{model_dir}'")

label_dict = corpus.make_label_dictionary()
word_embeddings = [WordEmbeddings(model_name)]
document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=256)
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)
trainer = ModelTrainer(classifier, corpus)

trainer.train(f'models/{model_name}.{model_var}',
 learning_rate=0.1,
 mini_batch_size=256, # 32,
 anneal_factor=0.5,
 patience=3,
 max_epochs=30)
# MICRO_AVG: acc 0.9947 - f1-score 0.9973
# MACRO_AVG: acc 0.7949 - f1-score 0.8713500000000001
# benign tp: 52522 - fp: 141 - fn: 0 - tn: 205 - precision: 0.9973 - recall: 1.0000 - accuracy: 0.9973 - f1-score: 0.9986
# malignant tp: 205 - fp: 0 - fn: 141 - tn: 52522 - precision: 1.0000 - recall: 0.5925 - accuracy: 0.5925 - f1-score: 0.7441
# 20

Created the directory models/glove.word_split 
2020-09-17 15:31:13,744 Computing label dictionary. Progress:


100%|████████████████████████████████████████████████████████████████████████| 422936/422936 [02:02<00:00, 3454.79it/s]

2020-09-17 15:34:06,947 [b'benign', b'malignant']



I0917 15:34:06.951880 11616 utils.py:422] loading Word2VecKeyedVectors object from C:\Users\leo_g\.flair\embeddings\glove.gensim
I0917 15:34:08.144447 11616 utils.py:461] loading vectors from C:\Users\leo_g\.flair\embeddings\glove.gensim.vectors.npy with mmap=None
I0917 15:34:08.233437 11616 utils.py:494] setting ignored attribute vectors_norm to None
I0917 15:34:08.234394 11616 utils.py:428] loaded C:\Users\leo_g\.flair\embeddings\glove.gensim


2020-09-17 15:34:08,343 ----------------------------------------------------------------------------------------------------
2020-09-17 15:34:08,345 Model: "TextClassifier(
 (document_embeddings): DocumentRNNEmbeddings(
 (embeddings): StackedEmbeddings(
 (list_embedding_0): WordEmbeddings('glove')
 )
 (word_reprojection_map): Linear(in_features=100, out_features=100, bias=True)
 (rnn): GRU(100, 256, batch_first=True)
 (dropout): Dropout(p=0.5, inplace=False)
 )
 (decoder): Linear(in_features=256, out_features=2, bias=True)
 (loss_function): CrossEntropyLoss()
)"
2020-09-17 15:34:08,346 ----------------------------------------------------------------------------------------------------
2020-09-17 15:34:08,347 Corpus: "Corpus: 422936 train + 52867 dev + 52868 test sentences"
2020-09-17 15:34:08,348 ----------------------------------------------------------------------------------------------------
2020-09-17 15:34:08,350 Parameters:
2020-09-17 15:34:08,351 - learning_rate: "0.1"
2020-09-

{'test_score': 0.9973,
 'dev_score_history': [],
 'train_loss_history': [],
 'dev_loss_history': []}

In [107]:
model_name = 'glove'
corpus = corpus_word_split
lr = 0.1
batch_size = 32
model_var = f'word_split_{lr}_{batch_size}'

import os

model_dir = f'models/{model_name}.{model_var}'
try:
 os.mkdir(model_dir)
except OSError:
 print (f"Creation of the directory '{model_dir}' failed.")
else:
 print (f"Created the directory '{model_dir}'")

label_dict = corpus.make_label_dictionary()
word_embeddings = [WordEmbeddings(model_name)]
document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=256)
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)
trainer = ModelTrainer(classifier, corpus)

trainer.train(model_dir,
 learning_rate=lr,
 mini_batch_size=batch_size, # 32,
 anneal_factor=0.5,
 patience=3,
 max_epochs=30)

# models\glove.word_split_0.5_256\best-model.pt
# MICRO_AVG: acc 0.9947 - f1-score 0.9973
# MACRO_AVG: acc 0.7949 - f1-score 0.8713500000000001
# benign tp: 52522 - fp: 141 - fn: 0 - tn: 205 - precision: 0.9973 - recall: 1.0000 - accuracy: 0.9973 - f1-score: 0.9986
# malignant tp: 205 - fp: 0 - fn: 141 - tn: 52522 - precision: 1.0000 - recall: 0.5925 - accuracy: 0.5925 - f1-score: 0.7441


# ******************** The BEST ***************************************************************
# ... training not finished
# 2020-09-17 19:17:51,743 loading file models\glove.word_split_0.1_32\best-model.pt
# 2020-09-17 19:19:07,756 0.9998	0.9998	0.9998
# MICRO_AVG: acc 0.9996 - f1-score 0.9998
# MACRO_AVG: acc 0.9844 - f1-score 0.9921
# benign tp: 52513 - fp: 2 - fn: 9 - tn: 344 - precision: 1.0000 - recall: 0.9998 - accuracy: 0.9998 - f1-score: 0.9999
# malignant tp: 344 - fp: 9 - fn: 2 - tn: 52513 - precision: 0.9745 - recall: 0.9942 - accuracy: 0.9690 - f1-score: 0.9843


Created the directory 'models/glove.word_split_0.1_32'
2020-09-17 15:59:57,686 Computing label dictionary. Progress:


100%|████████████████████████████████████████████████████████████████████████| 422936/422936 [01:49<00:00, 3879.39it/s]

2020-09-17 16:02:37,984 [b'benign', b'malignant']



I0917 16:02:37.986963 11616 utils.py:422] loading Word2VecKeyedVectors object from C:\Users\leo_g\.flair\embeddings\glove.gensim
I0917 16:02:39.216480 11616 utils.py:461] loading vectors from C:\Users\leo_g\.flair\embeddings\glove.gensim.vectors.npy with mmap=None
I0917 16:02:39.284472 11616 utils.py:494] setting ignored attribute vectors_norm to None
I0917 16:02:39.285509 11616 utils.py:428] loaded C:\Users\leo_g\.flair\embeddings\glove.gensim


2020-09-17 16:02:39,303 ----------------------------------------------------------------------------------------------------
2020-09-17 16:02:39,305 Model: "TextClassifier(
 (document_embeddings): DocumentRNNEmbeddings(
 (embeddings): StackedEmbeddings(
 (list_embedding_0): WordEmbeddings('glove')
 )
 (word_reprojection_map): Linear(in_features=100, out_features=100, bias=True)
 (rnn): GRU(100, 256, batch_first=True)
 (dropout): Dropout(p=0.5, inplace=False)
 )
 (decoder): Linear(in_features=256, out_features=2, bias=True)
 (loss_function): CrossEntropyLoss()
)"
2020-09-17 16:02:39,306 ----------------------------------------------------------------------------------------------------
2020-09-17 16:02:39,308 Corpus: "Corpus: 422936 train + 52867 dev + 52868 test sentences"
2020-09-17 16:02:39,310 ----------------------------------------------------------------------------------------------------
2020-09-17 16:02:39,311 Parameters:
2020-09-17 16:02:39,312 - learning_rate: "0.1"
2020-09-

{'test_score': 0.9998,
 'dev_score_history': [0.9995,
 0.9998,
 0.9997,
 0.9998,
 0.9998,
 0.9998,
 0.9998,
 0.9998,
 0.9998,
 0.9998,
 0.9998,
 0.9998,
 0.9998,
 0.9998],
 'train_loss_history': [0.006704615072667729,
 0.0018764245461755271,
 0.001428390409741658,
 0.001281433457158759,
 0.001160885160813556,
 0.0009288391432520336,
 0.0008205886772528867,
 0.0007256656202444912,
 0.0006219205977933742,
 0.0007342968501146098,
 0.0005951481113686868,
 0.0006219105104267317,
 0.0005906871863142453,
 0.0006297557100612004],
 'dev_loss_history': [tensor(0.0020, device='cuda:0'),
 tensor(0.0008, device='cuda:0'),
 tensor(0.0010, device='cuda:0'),
 tensor(0.0009, device='cuda:0'),
 tensor(0.0006, device='cuda:0'),
 tensor(0.0005, device='cuda:0'),
 tensor(0.0005, device='cuda:0'),
 tensor(0.0005, device='cuda:0'),
 tensor(0.0005, device='cuda:0'),
 tensor(0.0005, device='cuda:0'),
 tensor(0.0004, device='cuda:0'),
 tensor(0.0005, device='cuda:0'),
 tensor(0.0006, device='cuda:0'),
 tensor(0

### greedy conclusion
The GloVe experiments show that splitting DNs to words works much better. So, we will use only data sets with word-splitting in the next experiments.

### en (FastText) / word_split
[CLASSIC_WORD_EMBEDDINGS](https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/CLASSIC_WORD_EMBEDDINGS.md)

In [7]:
# emb = WordEmbeddings('en')

2020-09-17 20:48:51,533 https://flair.informatik.hu-berlin.de/resources/embeddings/token/en-fasttext-news-300d-1M.vectors.npy not found in cache, downloading to D:\Temp\tmp8nn7o21t


100%|██████████████████████████████████████████████████████████████| 1200000128/1200000128 [07:17<00:00, 2743514.45B/s]

2020-09-17 20:56:09,623 copying D:\Temp\tmp8nn7o21t to cache at C:\Users\leo_g\.flair\embeddings\en-fasttext-news-300d-1M.vectors.npy





2020-09-17 20:56:11,760 removing temp file D:\Temp\tmp8nn7o21t
2020-09-17 20:56:12,478 https://flair.informatik.hu-berlin.de/resources/embeddings/token/en-fasttext-news-300d-1M not found in cache, downloading to D:\Temp\tmpbf28rx8w


100%|██████████████████████████████████████████████████████████████████| 54600983/54600983 [00:19<00:00, 2804971.30B/s]

2020-09-17 20:56:32,619 copying D:\Temp\tmpbf28rx8w to cache at C:\Users\leo_g\.flair\embeddings\en-fasttext-news-300d-1M





2020-09-17 20:56:32,711 removing temp file D:\Temp\tmpbf28rx8w


I0917 20:56:32.719858 11228 utils.py:422] loading Word2VecKeyedVectors object from C:\Users\leo_g\.flair\embeddings\en-fasttext-news-300d-1M
I0917 20:56:35.019429 11228 utils.py:461] loading vectors from C:\Users\leo_g\.flair\embeddings\en-fasttext-news-300d-1M.vectors.npy with mmap=None
I0917 20:56:35.518429 11228 utils.py:494] setting ignored attribute vectors_norm to None
I0917 20:56:35.519431 11228 utils.py:428] loaded C:\Users\leo_g\.flair\embeddings\en-fasttext-news-300d-1M


In [3]:
import os

# 'en' (or 'en-news' or 'news')	English	FastText embeddings over news and wikipedia data

model_name = 'en'

corpus = corpus_word_split
lr = 0.1
batch_size = 32
model_var = f'word_split_{lr}_{batch_size}'


model_dir = f'models/{model_name}.{model_var}'
try:
 os.mkdir(model_dir)
except OSError:
 print (f"Creation of the directory '{model_dir}' failed.")
else:
 print (f"Created the directory '{model_dir}'")

label_dict = corpus.make_label_dictionary()
word_embeddings = [WordEmbeddings(model_name)]
document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=256)
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)
trainer = ModelTrainer(classifier, corpus)

trainer.train(model_dir,
 learning_rate=lr,
 mini_batch_size=batch_size, # 32,
 anneal_factor=0.5,
 patience=3,
 max_epochs=30)

# 2020-09-18 02:26:11,258 loading file models\en.word_split_0.1_32\best-model.pt
# 2020-09-18 02:27:51,902 	0.9999
# 2020-09-18 02:27:52,074 
# Results:
# - F-score (micro) 0.9999
# - F-score (macro) 0.9956
# - Accuracy 0.9999

# By class:
# precision recall f1-score support

# benign 0.9999 1.0000 0.9999 52522
# malignant 0.9942 0.9884 0.9913 346

# micro avg 0.9999 0.9999 0.9999 52868
# macro avg 0.9971 0.9942 0.9956 52868
# weighted avg 0.9999 0.9999 0.9999 52868
# samples avg 0.9999 0.9999 0.9999 52868


Creation of the directory 'models/en.word_split_0.1_32' failed.
2020-09-17 21:24:03,844 Computing label dictionary. Progress:


100%|████████████████████████████████████████████████████████████████████████| 475804/475804 [01:54<00:00, 4172.25it/s]

2020-09-17 21:27:03,865 [b'benign', b'malignant']



I0917 21:27:03.870359 7756 utils.py:422] loading Word2VecKeyedVectors object from C:\Users\leo_g\.flair\embeddings\en-fasttext-news-300d-1M
I0917 21:27:06.158385 7756 utils.py:461] loading vectors from C:\Users\leo_g\.flair\embeddings\en-fasttext-news-300d-1M.vectors.npy with mmap=None
I0917 21:27:08.727932 7756 utils.py:494] setting ignored attribute vectors_norm to None
I0917 21:27:08.728893 7756 utils.py:428] loaded C:\Users\leo_g\.flair\embeddings\en-fasttext-news-300d-1M


2020-09-17 21:27:13,074 ----------------------------------------------------------------------------------------------------
2020-09-17 21:27:13,076 Model: "TextClassifier(
 (document_embeddings): DocumentRNNEmbeddings(
 (embeddings): StackedEmbeddings(
 (list_embedding_0): WordEmbeddings('en')
 )
 (word_reprojection_map): Linear(in_features=300, out_features=300, bias=True)
 (rnn): GRU(300, 256, batch_first=True)
 (dropout): Dropout(p=0.5, inplace=False)
 )
 (decoder): Linear(in_features=256, out_features=2, bias=True)
 (loss_function): CrossEntropyLoss()
 (beta): 1.0
 (weights): None
 (weight_tensor) None
)"
2020-09-17 21:27:13,076 ----------------------------------------------------------------------------------------------------
2020-09-17 21:27:13,078 Corpus: "Corpus: 422936 train + 52867 dev + 52868 test sentences"
2020-09-17 21:27:13,079 ----------------------------------------------------------------------------------------------------
2020-09-17 21:27:13,081 Parameters:
2020-0

{'test_score': 0.9999,
 'dev_score_history': [0.9997,
 0.9998,
 0.9998,
 0.9997,
 0.9998,
 0.9998,
 0.9998,
 0.9998,
 0.9998,
 0.9998,
 0.9998,
 0.9998,
 0.9998,
 0.9998,
 0.9998,
 0.9998,
 0.9999,
 0.9998,
 0.9998,
 0.9998,
 0.9999,
 0.9998,
 0.9999,
 0.9999,
 0.9998,
 0.9999,
 0.9999,
 0.9999,
 0.9999,
 0.9999],
 'train_loss_history': [0.007005688341932229,
 0.0013865314515874637,
 0.0010426083367080436,
 0.0010144814260991692,
 0.0009920659119428336,
 0.0010587797197936935,
 0.0008621925794808552,
 0.0008075173992434479,
 0.000731073996334196,
 0.0007000347918036083,
 0.0007187440749044018,
 0.000748671631027228,
 0.000783193307360384,
 0.0006957890033008677,
 0.0006514761315378964,
 0.0006951858777576511,
 0.0005949204746242241,
 0.0006549592268721019,
 0.0006606985747614043,
 0.0005890207203470428,
 0.0005931733206512098,
 0.0005345358036626473,
 0.0006067134716228813,
 0.000516379041056304,
 0.0005498284672124913,
 0.00048060644434001716,
 0.0005666411932743337,
 0.00046890764830

## Further Development and Research (OUTDATED)
1. Additional Data Sources:
 1. Feedback data from the production systems: Predicted TPs and TNs. Can we discover simple heuristics/statistics?
1. The complexity of the existed DGAs and neural networks. Is there a correlation between them? Can we estimate the complexity of the DGA in terms of a number of parameters (or any other NN complexity measurements)?
1. Can we group the DGA by algo groups? If YES, can we train different NNs for different DGA groups and use an ensemble of the models?

In [6]:
def totalScore(num, blocks):
 # WRITE YOUR CODE HERE
 if not blocks: 
 return 0 
 score, prev_score, prev_prev_score = 0, 0, 0
 for el in blocks:
 if type(el) == int:
 score, prev_score, prev_prev_score = prev_score + el, el, prev_score
 elif el == 'X':
 score, prev_score, prev_prev_score = prev_score*2, prev_score*2, prev_score 
 elif el == '+':
 score, prev_score, prev_prev_score = prev_score + prev_prev_score, prev_score + prev_prev_score, prev_score
 elif and el == 'Z':
 score, prev_score, prev_prev_score = 
 return score
 


tests = [
 ([], 0)
 ([5,-2, 4], 7),
 ([1], 1),
 (['X'], 0),
 ([1, 'X'], 2),
 ([5,-2, 4], 4),([1, 2, '+', 'Z'], 3),
 ([5,-2,4,'Z','X',9, '+', '+'], 27),
]

_ = [print(t, totalScore(len(t), t), res) for t, res in tests if totalScore(len(t), t) != res]

[5, -2, 4, 'Z', 'X', 9, '+', '+'] 18 27
[1, 'X'] 1 2
[1, 2, '+', 'Z'] 0 3


# "character as a feature" approach 

(like in the DNS-Tunnel detector)

Additional changes:
* URL parsing with the `tldextract`
* Additional benign domains from the DNS-Tunnel project

Steps:
* prepare data
 - benign 
 - malitious
 
* preprocess data
* train

## Prepare data

### benign data

Data were prepared in the `researches/notebooks/domain_data.ipynb`, `Preprocess data/benign` section.

>Saved 28,709,889 into ../data/benign_domains/external/benign_domains.external.all.11M.csv

In [6]:
from pathlib import Path

bening_file = "data/benign/benign_domains.external.all.11M.csv"
Path(bening_file).exists()

True

### malignant data

In [46]:
from pathlib import Path

dga_synthetic_file = "data/domain_generation_algorithms/malignant_domains.dga_generated.43_algos.4M.csv"
dga_file = 'data/Adversarial-DGA-Datasets/Adversarial-DGA-Datasets.41K.csv'
Path(dga_synthetic_file).exists(), Path(dga_file).exists()

(True, True)

time: 5.29 ms (started: 2022-09-19 11:10:44 -07:00)


### Concatenate all with labels

In [10]:
from pathlib import Path
import pandas as pd
import tldextract
from sklearn.model_selection import train_test_split


def concatenate_all(files, out_dir, nrows=100):
 df_res = pd.DataFrame()
 for el in files:
 df = pd.read_csv(el["file"], nrows=nrows)
 print(f" Loaded {df.shape[0]:,} {el['file']}, columns: {list(df.columns)}")
 
 if not el['is_domain_extracted']:
 print(f" Domain extraction...") 
 df["domain"] = df["domain"].apply(lambda d: tldextract.extract(d).domain)
 df['label'] = el['label']
 print(f" Added the label: {el['label']}")
 
 df_res = pd.concat([df_res, df])
 print(f" Concatenated {df.shape[0]:,} -> {df_res.shape[0]:,}")
 
 df_res = df_res.drop_duplicates(subset='domain') # Preference to the first samples (to bening)!!!
 print(f" Deduplicated domains: {df_res.shape[0]:,}") 

 df = df_res
 df = shuffle(df).reset_index(drop=True)
 print('='*40)
 print(f"Shuffled: {df.shape[0]:,}")
 
 out_file = f"{out_dir}/data.csv"
 df.to_csv(out_file, index=False)
 print(f"Saved {df.shape[0]:,} {out_file}")
# X_train, X_tmp, y_train, y_tmp = train_test_split(
# df['domain'], df['label'], test_size=0.20, random_state=42, 
# stratify=df['label'], shuffle=True)
# X_dev, X_test, y_dev, y_test = train_test_split(
# X_tmp, y_tmp, test_size=0.50, random_state=42, 
# stratify=y_tmp, shuffle=True)
 
# print(f"X_train: {len(X_train):,}, y_train: {len(y_train):,}, samples: {X_train.iloc[0]} - {y_train.iloc[0]}")
# print(f"X_dev: {len(X_dev):,}, y_dev: {len(y_dev):,}, samples: {X_dev.iloc[0]} - {y_dev.iloc[0]}")
# print(f"X_test: {len(X_test):,}, y_test: {len(y_test):,}, samples: {X_test.iloc[0]} - {y_test.iloc[0]}")
 
# for name, x, y in zip(['train', 'dev', 'test'], [X_train, X_dev, X_test], [y_train, y_dev, y_test]):
# out_file = f"{out_dir}/x_{name}.csv"
# x.to_csv(out_file, index=False)
# print(f"Saved {len(x):,} {out_file}, name: {x.name}")
 
# out_file = f"{out_dir}/y_{name}.csv"
# y.to_csv(out_file, index=False)
# print(f"Saved {len(y):,} {out_file}, name: {y.name}")
 return df

# all files have a single "domain" column:
# it has the prefix part ('.com')
files = [
 {"label": "bening", 
 "file": "data/benign/benign_domains.external.all.11M.csv", 
 "is_domain_extracted": False
 },
 {"label": "malignant", 
 "file": "data/domain_generation_algorithms/malignant_domains.dga_generated.43_algos.4M.csv", 
 "is_domain_extracted": True
 },
 {"label": "malignant", 
 "file": "data/Adversarial-DGA-Datasets/Adversarial-DGA-Datasets.41K.csv", 
 "is_domain_extracted": False},
]

out_dir = 'data/training_data'
df = concatenate_all(files, out_dir, nrows=None)


 Loaded 11,937,220 data/benign/benign_domains.external.all.11M.csv, columns: ['domain']
 Domain extraction...
 Added the label: bening
 Concatenated 11,937,220 -> 11,937,220
 Deduplicated domains: 7,749,605
 Loaded 4,524,427 data/domain_generation_algorithms/malignant_domains.dga_generated.43_algos.4M.csv, columns: ['domain']
 Added the label: malignant
 Concatenated 4,524,427 -> 12,274,032
 Deduplicated domains: 12,272,225
 Loaded 41,584 data/Adversarial-DGA-Datasets/Adversarial-DGA-Datasets.41K.csv, columns: ['domain']
 Domain extraction...
 Added the label: malignant
 Concatenated 41,584 -> 12,313,809
 Deduplicated domains: 12,310,512
Shuffled: 12,310,512
X_train: 9,848,409, y_train: 9,848,409, samples: konzrdsk - malignant
X_dev: 1,231,051, y_dev: 1,231,051, samples: rwnhwhrwha - malignant
X_test: 1,231,052, y_test: 1,231,052, samples: justinmoorhouse - bening
Saved 9,848,409 data/training_data/x_train.csv, name: domain
Saved 9,848,409 data/training_data/y_train.csv, name: label
Sa

In [84]:
out_file = f"{out_dir}/data.csv"
df.to_csv(out_file, index=False)
print(f"Saved {df.shape[0]:,} {out_file}")

Saved 1,000 data/training_data/bytes_features/data.csv
time: 8.85 ms (started: 2022-09-23 08:48:05 -07:00)


In [98]:
# what is the domain string length distribution?
df_2 = df.copy(deep=True)
df_2['len'] = df_2['domain'].str.len()

time: 2.62 s (started: 2022-09-23 09:26:27 -07:00)


In [117]:
df_2[(df_2["len"] < 4)]

Unnamed: 0,domain,label,len
706,3mf,bening,3.0
1477,jr7,bening,3.0
3460,a4g,bening,3.0
5198,mwi,bening,3.0
5858,3sz,bening,3.0
...,...,...,...
12308406,l-o,bening,3.0
12308564,bjm,bening,3.0
12309336,2hz,bening,3.0
12309891,tia,bening,3.0


time: 34.5 ms (started: 2022-09-23 09:50:14 -07:00)


In [115]:
df_2[(df_2["len"] < 6) & (df_2["label"] == "malignant")]

Unnamed: 0,domain,label,len
5833227,ddwul,malignant,5.0
6934420,lyran,malignant,5.0
7030327,gaton,malignant,5.0
11719763,mamet,malignant,5.0


time: 414 ms (started: 2022-09-23 09:49:06 -07:00)


In [120]:
df_lens = df_2['len'].value_counts()

time: 109 ms (started: 2022-09-23 10:00:27 -07:00)


In [127]:
df_lens[df_lens < 50000]

25.0 46169
29.0 28235
26.0 25370
27.0 25243
3.0 24404
31.0 22386
37.0 20707
38.0 20681
30.0 20274
39.0 18118
36.0 14799
32.0 14038
40.0 12101
35.0 11187
33.0 10939
34.0 8690
41.0 6344
42.0 2768
2.0 1208
43.0 966
44.0 334
45.0 105
46.0 55
1.0 37
49.0 14
56.0 13
48.0 9
50.0 9
47.0 9
60.0 7
54.0 7
63.0 6
61.0 6
62.0 6
58.0 4
55.0 3
51.0 3
53.0 3
57.0 2
59.0 1
Name: len, dtype: int64

time: 8.56 ms (started: 2022-09-23 10:02:27 -07:00)


In [None]:
# conclusion:
 ## the domains with len < 26 keep most of the samples.

## old model, tokens based

### preprocessing

In [37]:
from transformers import DistilBertTokenizer
from collections import Counter

def extract_features(s: str, max_length: int = 14, tokenizer=None) -> List[int]:
 """
 It is the same transformation that was applied on the training samples.
 If we change it, we need to change it on the training samples and retrain a model!
 Transform a string into a sequence of tokens; then into the lengths of the tokens;
 then into the counters of lengths from 1 to 14.
 Tokens with lengths > 14 counted as 14 length. It is super rare when a token has a length > 14.
 Example:
 returns: [0, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
 - here a string transformed into 5 tokens; 3 tokens with length of 2 and 2 tokens - length of 3.
 Additional internal transformations:
 * [optional] replace all digits to '9' digit (it didn't show an improvement in the model performance).
 * remove the '##' markers because we don't care if the token is on the word border or not
 * remove the service tokens ([CLS], [SEP]) and the '.' token

 Parameters
 ----------
 s - a string to be parsed
 replace_digits - do we need to replace all digits to '9' digit?
 max_length - an upper limit of a token length

 Returns
 -------
 a list of the counters of the token lengths

 """
 # if replace_digits:
 # s = s.translate(self._digit_replacer)
 d = dict(
 Counter(
 [
 len(t.replace("##", ""))
 for t in tokenizer.convert_ids_to_tokens(tokenizer(s)["input_ids"])
 if t not in ["[CLS]", "[SEP]", "."]
 ]
 )
 )
 features = [d[i] if i in d else 0 for i in range(1, 19)]
 features_cut = features[: max_length - 1] + [sum(features[max_length:])]
 return features_cut

def prepare_old_features(df, nrows=None):
 tokenizer = DistilBertTokenizer.from_pretrained("tokenizer/")
 df = df[:nrows].copy(deep=True)
 df['features'] = df['domain'].apply(lambda d: extract_features(d, max_length=14, tokenizer=tokenizer))
 df['y'] = df['label'].apply(lambda d: 0 if d == 'bening' else 1)
 out_file = "data/training_data/old_features/data.csv"
 df.to_csv(out_file, index=False)
 print(f"Saved {df.shape[0]:,} {out_file}") 
 return df


out_dir = 'data/training_data/old_features/'


Saved 300 data/training_data/old_features/data.csv
time: 90.8 ms (started: 2022-09-21 17:08:14 -07:00)


In [63]:
nrows = None
df_res = prepare_old_features(df, nrows=nrows)

Saved 12,310,512 data/training_data/old_features/data.csv
time: 19min 7s (started: 2022-09-21 17:41:33 -07:00)


### training

In [64]:
model = CatBoostClassifier(loss_function='MultiClass') # Logloss, MultiClass

size = df_res.shape[0]
train_size, dev_size, test_size = int(size*0.8), int(size*0.9), size
print(f"{train_size=}, {dev_size=}, {test_size=}")
X_train, y_train = np.array(list(df_res.loc[0:train_size, 'features'])), df_res.loc[0:train_size, 'y'] 
X_dev, y_dev = np.array(list(df_res.loc[train_size:dev_size, 'features'])), df_res.loc[train_size:dev_size, 'y']
X_test, y_test, domain_test = np.array(list(df_res.loc[dev_size:, 'features'])), df_res.loc[dev_size:, 'y'], df_res.loc[size*2:size*3, 'domain']

model.fit(X_train, y_train, logging_level='Silent', eval_set=(X_dev, y_dev))


train_size=9848409, dev_size=11079460, test_size=12310512


<catboost.core.CatBoostClassifier at 0x7f7794b2dca0>

time: 9min 39s (started: 2022-09-21 18:00:41 -07:00)


In [65]:
y_pred = model.predict_proba(X_test)

time: 2.37 s (started: 2022-09-21 18:11:26 -07:00)


### Results

In [6]:
def results(y_true, y_scores, y_pred=None):
 if not y_pred:
 y_pred = [1 if sc > 0.5 else 0 for sc in y_scores]
 support = len(y_true)
 print(f"support: {support:}")
 print(f"average_precision_score: {average_precision_score(y_true, y_scores):.3}")
 roc_auc_score_val = roc_auc_score(y_true, y_scores)
 print(f"roc_auc_score: {roc_auc_score_val:.3}")
 print(f"classification_report: \n{classification_report(y_true, y_pred)}")

time: 1.27 ms (started: 2022-09-26 12:26:22 -07:00)


In [66]:
y_scores = [y[1] for y in y_pred]

results(y_test, y_scores)

# support: 1_231_052
# average_precision_score: 0.942
# roc_auc_score: 0.959
# classification_report: 
# precision recall f1-score support

# 0 0.91 0.95 0.93 775348
# 1 0.90 0.84 0.87 455704

# accuracy 0.91 1231052
# macro avg 0.90 0.89 0.90 1231052
# weighted avg 0.91 0.91 0.90 1231052

# support: 10000
# average_precision_score: 0.942
# roc_auc_score: 0.957
# classification_report: 
# precision recall f1-score support

# 0 0.90 0.94 0.92 6222
# 1 0.90 0.83 0.87 3778

# accuracy 0.90 10000
# macro avg 0.90 0.89 0.89 10000
# weighted avg 0.90 0.90 0.90 10000

# support: 100
# average_precision_score: 0.881
# roc_auc_score: 0.888
# classification_report: 
# precision recall f1-score support

# 0 0.91 0.91 0.91 67
# 1 0.82 0.82 0.82 33

# accuracy 0.88 100
# macro avg 0.86 0.86 0.86 100
# weighted avg 0.88 0.88 0.88 100



support: 1231052
average_precision_score: 0.942
roc_auc_score: 0.959
classification_report: 
 precision recall f1-score support

 0 0.91 0.95 0.93 775348
 1 0.90 0.84 0.87 455704

 accuracy 0.91 1231052
 macro avg 0.90 0.89 0.90 1231052
weighted avg 0.91 0.91 0.90 1231052

time: 1.83 s (started: 2022-09-21 18:11:35 -07:00)


In [70]:
y_scores = [y[1] for y in y_pred]
aps = average_precision_score(y_test, y_scores)
out_file_name = f'models/catboost.{aps:.3}.tokens.model'

model.save_model(out_file_name)
print(f'Saved {out_file_name}')


Saved models/catboost.0.942.tokens.model
time: 335 ms (started: 2022-09-21 18:19:18 -07:00)


## model "bytes as a feature"

### preprocessing

In [8]:
def extract_features(s: str, pad_char: str = "=", max_len=20) -> str:
 """
 It converts a string into the features.
 Features are the bytes decoded from the string.
 Features are formatted as the '99 111 108 111 ...' string.
 Features is the fixed-size array.
 The feature array is taken from the middle of the string if the string is longer than the feature_num;
 the string is padded with a pad char if it is shorter than the feature_num.
 Parameters
 ----------
 s - a subdomain name, like 'player' in 'player.my-gaming.com'
 Returns
 -------
 s - a feature array in form of the string '99 111 108 111 ...'.
 """
 # encode to the byte array:
 if isinstance(s, bytes):
 s = s.decode()
 elif not isinstance(s, (str, bytes)):
 raise ValueError(s, "Only str or bytearray type can be processed by the _extract_features().")

 s_len = len(s)
 # cut or pad a string
 if s_len < max_len:
 s = s + pad_char * (max_len - s_len)
 elif s_len > max_len:
 delta = (s_len - max_len) // 2
 s = s[delta : (max_len + delta)]

 return [ord(b) for b in s]

def prepare_features(df, out_dir=None, max_len=20, nrows=None):
 print(f"{df.shape=} {max_len=} {nrows=} ")
 df = df[:nrows].copy(deep=True)
 print(f"Start with {df.shape[0]:,} samples")
 df = df.dropna()
 print(f"After dropna {df.shape[0]:,} samples")
 df['features'] = df['domain'].apply(lambda d: extract_features(d, max_len=max_len))
 df['y'] = df['label'].apply(lambda d: 0 if d == 'bening' else 1)
 out_file = f"{out_dir}/data.csv"
 df.to_csv(out_file, index=False)
 print(f"Saved {df.shape[0]:,} {out_file}") 
 return df


df = pd.read_csv('data/training_data/data.csv')
print(f"Loaded {df.shape} data/training_data/data.csv")
nrows = None
max_len = 32
out_dir = 'data/training_data/bytes_features'
df_res = prepare_features(df, out_dir=out_dir, max_len=max_len, nrows=nrows)

Loaded (12310512, 2) data/training_data/data.csv
df.shape=(12310512, 2) max_len=32 nrows=None 
Start with 12,310,512 samples
After dropna 12,310,509 samples
Saved 12,310,509 data/training_data/bytes_features/data.csv
time: 1min 6s (started: 2022-09-23 11:33:02 -07:00)


### training

In [9]:
size = df_res.shape[0]
train_size, dev_size, test_size = int(size*0.8), int(size*0.9), size
print(f"{train_size:.=}, {dev_size:.=}, {test_size:.=}")
X_train, y_train = np.array(list(df_res.loc[0:train_size, 'features'])), df_res.loc[0:train_size, 'y'] 
X_dev, y_dev = np.array(list(df_res.loc[train_size:dev_size, 'features'])), df_res.loc[train_size:dev_size, 'y']
X_test, y_test, domain_test = np.array(list(df_res.loc[dev_size:, 'features'])), df_res.loc[dev_size:, 'y'], df_res.loc[size*2:size*3, 'domain']

model = CatBoostClassifier(loss_function='MultiClass') # Logloss, MultiClass

model.fit(X_train, y_train, eval_set=(X_dev, y_dev), verbose=False,
 plot=True)

# train_size=799999, dev_size=899999, test_size=999999
# time: 53.3 s (started: 2022-09-23 08:57:16 -07:00)

9848407, 11079458, 12310509


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7f0eaa606a90>

time: 13min 6s (started: 2022-09-23 11:34:43 -07:00)


### Results

In [10]:
y_pred = model.predict_proba(X_test)
y_scores = [y[1] for y in y_pred]

results(y_test, y_scores, y_pred=None)

# support: 1231054 (len = 26, iterations = 1400)
# average_precision_score: 0.964
# roc_auc_score: 0.973
# classification_report: 
# precision recall f1-score support
# 0 0.92 0.97 0.95 775350
# 1 0.95 0.86 0.90 455704
# accuracy 0.93 1231054
# macro avg 0.93 0.92 0.92 1231054
# weighted avg 0.93 0.93 0.93 1231054

# support: 1231054 (len = 26)
# average_precision_score: 0.961
# roc_auc_score: 0.971
# classification_report: 
# precision recall f1-score support
# 0 0.92 0.97 0.94 775350
# 1 0.94 0.86 0.90 455704
# accuracy 0.93 1231054
# macro avg 0.93 0.91 0.92 1231054
# weighted avg 0.93 0.93 0.93 1231054

# support: 1_231_054 
# average_precision_score: 0.958
# roc_auc_score: 0.968
# classification_report: 
# precision recall f1-score support
# 0 0.91 0.97 0.94 775350
# 1 0.94 0.84 0.89 455704
# accuracy 0.92 1231054
# macro avg 0.93 0.91 0.92 1231054
# weighted avg 0.92 0.92 0.92 1231054

# support: 100001
# average_precision_score: 0.957
# roc_auc_score: 0.966
# classification_report: 
# precision recall f1-score support
# 0 0.91 0.97 0.94 62847
# 1 0.94 0.84 0.89 37154
# accuracy 0.92 100001
# macro avg 0.93 0.90 0.91 100001
# weighted avg 0.92 0.92 0.92 100001

# support: 10000
# average_precision_score: 0.946
# roc_auc_score: 0.958
# classification_report: 
# precision recall f1-score support
# 0 0.89 0.97 0.93 6222
# 1 0.94 0.81 0.87 3778
# accuracy 0.91 10000
# macro avg 0.92 0.89 0.90 10000
# weighted avg 0.91 0.91 0.91 10000
# time: 77.5 ms (started: 2022-09-23 08:52:08 -07:00)

support: 1231054
average_precision_score: 0.962
roc_auc_score: 0.972
classification_report: 
 precision recall f1-score support

 0 0.92 0.97 0.94 775350
 1 0.94 0.86 0.90 455704

 accuracy 0.93 1231054
 macro avg 0.93 0.91 0.92 1231054
weighted avg 0.93 0.93 0.93 1231054

time: 6.63 s (started: 2022-09-23 11:47:50 -07:00)


In [11]:
aps = average_precision_score(y_test, y_scores)
out_file_name = f'models/catboost.{aps:.3}.{max_len}_bytes.model'

model.save_model(out_file_name)
print(f'Saved {out_file_name}')

Saved models/catboost.0.962.32_bytes.model
time: 239 ms (started: 2022-09-23 11:47:57 -07:00)


## Ensemble: token-based and bytes-based

### preprocessing

In [3]:
# Run it once when prepare a data file. Otherwise, be aware it is not fast!

from transformers import DistilBertTokenizer
from collections import Counter

def extract_old_features(s: str, max_length: int = 14, tokenizer=None) -> List[int]:

 d = dict(
 Counter(
 [
 len(t.replace("##", ""))
 for t in tokenizer.convert_ids_to_tokens(tokenizer(s)["input_ids"])
 if t not in ["[CLS]", "[SEP]", "."]
 ]
 )
 )
 features = [d[i] if i in d else 0 for i in range(1, 19)]
 features_cut = features[: max_length - 1] + [sum(features[max_length:])]
 return features_cut

# def prepare_old_features(df, nrows=None):
# tokenizer = DistilBertTokenizer.from_pretrained("tokenizer/")
# df = df[:nrows].copy(deep=True)
# df['features'] = df['domain'].apply(lambda d: extract_features(d, max_length=14, tokenizer=tokenizer))
# df['y'] = df['label'].apply(lambda d: 0 if d == 'bening' else 1)
# out_file = "data/training_data/old_features/data.csv"
# df.to_csv(out_file, index=False)
# print(f"Saved {df.shape[0]:,} {out_file}") 
# return df


def extract_new_features(s: str, pad_char: str = "=", max_len=20) -> List[int]:
 # encode to the byte array:
 if isinstance(s, bytes):
 s = s.decode()
 elif not isinstance(s, (str, bytes)):
 raise ValueError(s, "Only str or bytearray type can be processed by the _extract_features().")

 s_len = len(s)
 # cut or pad a string
 if s_len < max_len:
 s = s + pad_char * (max_len - s_len)
 elif s_len > max_len:
 delta = (s_len - max_len) // 2
 s = s[delta : (max_len + delta)]

 return [ord(b) for b in s]

def extract_features(s: str, pad_char: str = "=", max_len=20, max_token_num: int = 14, tokenizer=None):
 return (extract_old_features(s, max_length=max_token_num, tokenizer=tokenizer)
 + extract_new_features(s, max_len=max_len))
 

def prepare_features(df, out_dir=None, max_len=20, nrows=None, save=False):
 print(f"{df.shape=} {max_len=} {nrows=} ")
 df = df[:nrows].copy(deep=True)
 print(f"Start with {df.shape[0]:,} samples")
 df = df.dropna()
 print(f"After dropna {df.shape[0]:,} samples")
 tokenizer = DistilBertTokenizer.from_pretrained("models/tokenizer/")
 df['features'] = df['domain'].apply(lambda d: extract_features(
 d, max_len=max_len,
 # max_token_num=max_token_num,
 tokenizer=tokenizer
 ))
 df['y'] = df['label'].apply(lambda d: 0 if d == 'bening' else 1)
 if save:
 out_file = f"{out_dir}/data.csv"
 df.to_csv(out_file, index=False)
 print(f"Saved {df.shape[0]:,} {out_file}") 
 print(f"Result {df.shape[0]:,}")
 return df


df = pd.read_csv('data/training_data/data.csv')
print(f"Loaded {df.shape} data/training_data/data.csv")
nrows = None
max_len = 26
out_dir = 'data/training_data/ensemble_features'
df_res = prepare_features(df, out_dir=out_dir, max_len=max_len, nrows=nrows, save=False)


Loaded (12310512, 2) data/training_data/data.csv
df.shape=(12310512, 2) max_len=26 nrows=None 
Start with 12,310,512 samples
After dropna 12,310,509 samples


KeyboardInterrupt: 

time: 12min 39s (started: 2022-09-26 11:22:20 -07:00)


### training

In [2]:
import numpy as np

def cast_to_array(s):
 return np.array([int(el) for el in s[1:-1].split(", ")])

# s = "[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 121, 97, 114, 100, 122, 101, 110, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61]"
# cast_to_array(s)

time: 1.12 ms (started: 2022-09-26 11:37:08 -07:00)


In [3]:
# Run it OLY if you didn't get the df_res in the "preprocessing" before!
# To debug it decrease nrows!. You save time on training on a small dataset:
in_file = "data/training_data/ensemble_features/data.csv"
nrows = None
df_res = pd.read_csv(in_file, nrows=nrows)
df_res['features'] = df_res['features'].apply(cast_to_array)
df_res.shape, df_res[:1]

((12310509, 4),
 domain label features y
 0 yardzen bening [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 121... 0)

time: 1min 13s (started: 2022-09-26 11:37:11 -07:00)


In [4]:
size = df_res.shape[0]
train_size, dev_size, test_size = int(size*0.8), int(size*0.9), size
print(f"train: {train_size:,}, dev: {dev_size:,}, test: {test_size:,}")
X_train, y_train = np.array(list(df_res.loc[0:train_size, 'features'])), df_res.loc[0:train_size, 'y'] 
X_dev, y_dev = np.array(list(df_res.loc[train_size:dev_size, 'features'])), df_res.loc[train_size:dev_size, 'y']
# X_test, y_test, domain_test = np.array(list(df_res.loc[dev_size:, 'features'])), df_res.loc[dev_size:, 'y'], df_res.loc[size*2:size*3, 'domain']

print("Start training...")
model = CatBoostClassifier(loss_function='MultiClass') # Logloss, MultiClass

model.fit(X_train, y_train, eval_set=(X_dev, y_dev), verbose=False,
 plot=True)

# train_size=799999, dev_size=899999, test_size=999999
# time: 53.3 s (started: 2022-09-23 08:57:16 -07:00)

train: 9,848,407, dev: 11,079,458, test: 12,310,509
Start training...


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7fd81043cb20>

time: 14min 7s (started: 2022-09-26 11:40:42 -07:00)


### Results

In [7]:
X_test, y_test = np.array(list(df_res.loc[dev_size:, 'features'])), df_res.loc[dev_size:, 'y'] # , df_res.loc[size*2:size*3, 'domain']

y_pred = model.predict_proba(X_test)
y_scores = [y[1] for y in y_pred]

results(y_test, y_scores, y_pred=None)


support: 1231051
average_precision_score: 0.977
roc_auc_score: 0.985
classification_report: 
 precision recall f1-score support

 0 0.95 0.97 0.96 775348
 1 0.94 0.91 0.93 455703

 accuracy 0.95 1231051
 macro avg 0.95 0.94 0.94 1231051
weighted avg 0.95 0.95 0.95 1231051

time: 8.03 s (started: 2022-09-26 12:26:28 -07:00)


In [None]:
# support: 1231051
# average_precision_score: 0.977
# roc_auc_score: 0.985
# classification_report: 
# precision recall f1-score support
# 0 0.95 0.97 0.96 775348
# 1 0.94 0.91 0.93 455703
# accuracy 0.95 1231051
# macro avg 0.95 0.94 0.94 1231051
# weighted avg 0.95 0.95 0.95 1231051

# support: 100000
# average_precision_score: 0.976
# roc_auc_score: 0.983
# classification_report: 
# precision recall f1-score support
# 0 0.94 0.97 0.96 62848
# 1 0.94 0.90 0.92 37152
# accuracy 0.94 100000
# macro avg 0.94 0.94 0.94 100000
# weighted avg 0.94 0.94 0.94 100000

# support: 10000
# average_precision_score: 0.971
# roc_auc_score: 0.979
# classification_report: 
# precision recall f1-score support
# 0 0.93 0.96 0.95 6222
# 1 0.94 0.89 0.91 3778
# accuracy 0.94 10000
# macro avg 0.94 0.93 0.93 10000
# weighted avg 0.94 0.94 0.93 10000

In [9]:
max_len = 26

aps = average_precision_score(y_test, y_scores)
out_file_name = f'models/catboost.{aps:.3}.{max_len}_ensemble.model'

model.save_model(out_file_name)
print(f'Saved {out_file_name}')

Saved models/catboost.0.977.26_ensemble.model
time: 286 ms (started: 2022-09-26 12:28:09 -07:00)


## Additional analysis

### feature importance

In [12]:
model_file = "models/catboost.0.977.26_ensemble.model"

model = CatBoostClassifier().load_model(model_file)
len(model.get_feature_importance()), [int(f*100) for f in model.get_feature_importance()], sum([int(f*100) for f in model.get_feature_importance()])

(40,
 [1457,
 1293,
 311,
 373,
 215,
 112,
 75,
 42,
 33,
 19,
 11,
 7,
 3,
 0,
 816,
 383,
 328,
 461,
 351,
 592,
 507,
 433,
 433,
 291,
 285,
 263,
 156,
 82,
 50,
 113,
 122,
 54,
 10,
 3,
 20,
 7,
 5,
 10,
 20,
 237],
 9983)

time: 25 ms (started: 2022-09-26 15:16:28 -07:00)
