Commit 01e0080

committed Nov 10, 2021
train script; update configs
1 parent 9bf75b9 commit 01e0080

File tree

6 files changed: +215 -15 lines changed

 

.gitignore

Lines changed: 2 additions & 0 deletions

@@ -127,3 +127,5 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+.vscode

README.md

Lines changed: 36 additions & 1 deletion

@@ -1 +1,36 @@
-# anomaly_transformer_pytorch
+# Anomaly Transformer in PyTorch
+
+This is an implementation of [Anomaly Transformer: Time Series Anomaly Detection with Association Discrepancy](https://arxiv.org/abs/2110.02642). The paper is currently [under review](https://openreview.net/forum?id=LzQQ89U1qm_) and some details of its attention mechanism still need clarification; this repo will be updated as more information becomes available.
+
+## Usage
+
+### Requirements
+
+Install dependencies into a virtualenv:
+
+```bash
+$ python -m venv env
+$ source env/bin/activate
+(env) $ pip install -r requirements.txt
+```
+
+### Data and Configuration
+
+Custom datasets can be placed in the `data/` dir. Edit `conf/data/default.yaml` to reflect the properties of the data; all other hyperparameters can be set in the hydra configs.
+
+### Train
+
+Once properly configured, a model can be trained via `python train.py`.
+
+## Citations
+
+```bibtex
+@misc{xu2021anomaly,
+      title={Anomaly Transformer: Time Series Anomaly Detection with Association Discrepancy},
+      author={Jiehui Xu and Haixu Wu and Jianmin Wang and Mingsheng Long},
+      year={2021},
+      eprint={2110.02642},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG}
+}
+```
File renamed without changes.
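The README's "Data and Configuration" note above assumes a dataset dropped into `data/` and described by `conf/data/default.yaml`, but the commit itself does not show what shape the training loader expects. As a rough sketch only, a windowed time-series `Dataset` such as the one below would be compatible with the plain `DataLoader` constructed in `train.py`; the CSV layout, the class name, and the window length are illustrative assumptions, not part of the repo.

```python
# Hypothetical helper, not part of this commit: wraps a CSV of shape
# (timesteps, channels) into fixed-length windows for a torch DataLoader.
import numpy as np
import torch
from torch.utils.data import Dataset


class WindowedSeries(Dataset):
    def __init__(self, csv_path: str, window: int = 100):
        series = np.loadtxt(csv_path, delimiter=",", dtype=np.float32)
        if series.ndim == 1:  # single-channel series -> add a feature axis
            series = series[:, None]
        self.series = torch.from_numpy(series)
        self.window = window

    def __len__(self):
        return max(len(self.series) - self.window + 1, 0)

    def __getitem__(self, idx):
        # each item is one (window, channels) slice of the raw series
        return self.series[idx : idx + self.window]
```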

conf/config.yaml

Lines changed: 46 additions & 2 deletions

@@ -1,7 +1,51 @@
 defaults:
-  - model: transformers
+  - model: default
+  - data: default
   - evaluate: default
   - train: default
 
+seed: 0
+debug: False
+silent: False
+device: cuda
+
+max_iters: 1000000
+log_interval: 100
+val_interval: 5000
+model_save_pt: 5000
+
+lr: 1e-5
+batch_size: 32
+val_steps: 500
+grad_clip: 100.
+early_stop_patience: 20000
+early_stop_key: "loss/total_edit_val"
+dropout: 0.0
+results_dir: null
+
+eval_only: False
+half: False
+save: False
+
+model:
+  pt: null
+
+data:
+  path: null
+  rephrase: true
+  zsre_nq: true
+  nq_path: ${hydra:runtime.cwd}/data/nq
+  wiki_webtext: true
+  n_edits: 1
+
+eval:
+  verbose: True
+  log_interval: 100
+  final_eval: True
+
 hydra:
-  auto: False
+  run:
+    dir: ./outputs/${now:%Y-%m-%d_%H-%M-%S_%f${uuid:}}
+  sweep:
+    dir: ./outputs/${now:%Y-%m-%d_%H-%M-%S_%f}
+    subdir: ${hydra.job.num}
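For a quick look at the keys this commit adds, the config can be loaded directly with OmegaConf and merged with dot-list overrides. This is only an inspection sketch: plain OmegaConf does not compose the `defaults` list or resolve the `${now:...}` and `${hydra:...}` expressions, which hydra handles at runtime via `train.py`.

```python
# Inspection sketch only: peek at the top-level keys added to conf/config.yaml.
# The defaults list and hydra-specific resolvers are left unresolved here.
from omegaconf import OmegaConf

base = OmegaConf.load("conf/config.yaml")
overrides = OmegaConf.from_dotlist(["lr=1e-4", "batch_size=64", "device=cpu"])
config = OmegaConf.merge(base, overrides)

print(config.lr, config.batch_size, config.device)  # overridden training knobs
print(OmegaConf.to_yaml(config.data))                # the new data block
```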

model.py

Lines changed: 19 additions & 12 deletions

@@ -8,10 +8,11 @@
 
 
 class AnomalyAttention(nn.Module):
-    def __init__(self, seq_dim, channels):
+    def __init__(self, seq_dim, in_channels, out_channels):
         super(AnomalyAttention, self).__init__()
-        self.Q = self.K = self.V = self.sigma = torch.zeros((seq_dim, channels))
-        self.d_model = channels
+        self.W = nn.Linear(in_channels, out_channels, bias=False)
+        self.Q = self.K = self.V = self.sigma = torch.zeros((seq_dim, out_channels))
+        self.d_model = out_channels
         self.n = seq_dim
         self.P = torch.zeros((seq_dim, seq_dim))
         self.S = torch.zeros((seq_dim, seq_dim))
@@ -21,37 +22,43 @@ def forward(self, x):
         self.initialize(x)
         self.P = self.prior_association()
         self.S = self.series_association()
-        print(self.S.shape)
-        # assert self.S.shape == (self.n, self.n)
         Z = self.reconstruction()
 
         return Z
 
     def initialize(self, x):
         # self.d_model = x.shape[-1]
-        self.Q = self.K = self.V = self.sigma = x
+        self.Q = self.K = self.V = self.sigma = self.W(x)
+
 
     def prior_association(self):
-        return torch.ones((self.n, self.n))
+        p = torch.from_numpy(
+            np.abs(
+                np.indices((self.n, self.n))[0] -
+                np.indices((self.n, self.n))[1]
+            )
+        )
+        gaussian = torch.normal(p.float(), self.sigma[:, 0].abs())
+        gaussian /= gaussian.sum(dim=-1).view(-1, 1)
+
+        return gaussian
 
     def series_association(self):
-        print(self.Q.shape)
-        print(self.K.shape)
         return F.softmax((self.Q @ self.K.T) / math.sqrt(self.d_model), dim=0)
 
     def reconstruction(self):
         return self.S @ self.V
 
     def association_discrepancy(self):
-        return F.kl_div(self.P, self.S) + F.kl_div(self.S, self.P) #not going to be correct dimensions
+        return F.kl_div(self.P, self.S) + F.kl_div(self.S, self.P)
 
 
 class AnomalyTransformerBlock(nn.Module):
     def __init__(self, seq_dim, feat_dim):
         super().__init__()
         self.seq_dim, self.feat_dim = seq_dim, feat_dim
 
-        self.attention = AnomalyAttention(self.seq_dim, self.feat_dim)
+        self.attention = AnomalyAttention(self.seq_dim, self.feat_dim, self.feat_dim)
         self.ln1 = nn.LayerNorm(self.feat_dim)
         self.ff = nn.Sequential(
             nn.Linear(self.feat_dim, self.feat_dim),
@@ -94,7 +101,7 @@ def forward(self, x):
 
     def loss(self, x):
         l2_norm = torch.linalg.matrix_norm(self.output - x, ord=2)
-        return l2_norm + (lambda_ * self.assoc_discrepancy)
+        return l2_norm + (self.lambda_ * self.assoc_discrepancy.mean())
 
     def anomaly_score(self, x):
         score = F.softmax(-self.assoc_discrepancy, dim=0)
train.py

Lines changed: 112 additions & 0 deletions

@@ -0,0 +1,112 @@
+import logging
+from datetime import datetime
+
+import numpy as np
+import torch
+import wandb
+from tqdm import tqdm
+from torch.utils.data import DataLoader
+
+import hydra
+from omegaconf import DictConfig
+from omegaconf.omegaconf import OmegaConf
+from transformers.optimization import AdamW, get_cosine_schedule_with_warmup
+
+from model import AnomalyTransformer
+
+logger = logging.getLogger(__name__)
+
+
+def train(config, model, train_data, val_data):
+
+    train_dataloader = DataLoader(
+        train_data,
+        batch_size=config.train.batch_size,
+        shuffle=config.train.shuffle,
+        # collate_fn=collate_fn,
+        drop_last=True,
+    )
+    total_steps = int(len(train_dataloader) * config.train.epochs)
+    warmup_steps = max(int(total_steps * config.train.warmup_ratio), 200)
+    optimizer = AdamW(
+        model.parameters(),
+        lr=config.train.lr,
+        eps=config.train.adam_epsilon,
+    )
+    scheduler = get_cosine_schedule_with_warmup(
+        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
+    )
+    print("Total steps: {}".format(total_steps))
+    print("Warmup steps: {}".format(warmup_steps))
+
+    num_steps = 0
+    best_f1 = 0
+    model.train()
+
+    for epoch in range(int(config.train.epochs)):
+        model.zero_grad()
+        for step, batch in enumerate(tqdm(train_dataloader)):
+
+            outputs = model(**inputs)
+            loss = outputs.loss()
+            loss.backward()
+
+            torch.nn.utils.clip_grad_norm_(
+                model.parameters(), config.train.max_grad_norm
+            )
+            optimizer.step()
+            scheduler.step()
+            model.zero_grad()
+
+            num_steps += 1
+
+            if not config.debug:
+                wandb.log({"loss": loss.item()}, step=num_steps)
+
+        output = validate(config, model, val_data)
+        if not config.debug:
+            wandb.log(output, step=num_steps)
+
+        if output["validation_f1"] > best_f1:
+            print(f"Best validation F1! Saving to {config.train.pt}")
+            torch.save(model.state_dict(), config.train.pt)
+
+        best_f1 = max(best_f1, output["validation_f1"])
+
+
+def validate(config, model, data):
+    return 0
+
+
+@hydra.main(config_path="./conf", config_name="config")
+def main(config: DictConfig) -> None:
+
+    set_seed(config.train.state.seed)
+
+    logger.info(OmegaConf.to_yaml(config, resolve=True))
+    logger.info(f"Using the model: {config.model.name}")
+
+    train_data, val_data = get_data(config)
+    config.data.num_class = len(set([x["labels"] for x in train_features]))
+    print(f"num_class: {config.data.num_class}")
+
+    if not config.debug:
+        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+        run_name = f"{config.train.wandb.run_name}_{config.model.model}_{config.data.name}_{timestamp}"
+        wandb.init(
+            entity=config.train.wandb_entity,
+            project=config.train.wandb_project,
+            config=dict(config),
+            name=run_name,
+        )
+        if not config.train.pt:
+            config.train.pt = f"{config.train.pt}/{run_name}"
+
+    model = AnomalyTransformer(config)
+    model.to(config.device)
+
+    train(config, model, train_data, val_data)
+
+
+if __name__ == "__main__":
+    main()
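One loose end in `train.py` as committed: `validate()` returns `0`, while the training loop indexes `output["validation_f1"]` and logs the dict to wandb. A placeholder that at least matches that expected shape might look like the sketch below; the reconstruction-error scoring rule, the fixed threshold, and the assumption that `val_data` yields `(window, label)` pairs are all illustrative and not part of the commit.

```python
# Illustrative placeholder only. Assumptions: model(x) returns a reconstruction of x,
# val_data yields (window, label) pairs, and a fixed error threshold is acceptable.
import torch
from torch.utils.data import DataLoader


@torch.no_grad()
def validate(config, model, val_data, threshold=0.5):
    model.eval()
    tp = fp = fn = 0
    for x, y in DataLoader(val_data, batch_size=config.train.batch_size):
        x = x.to(config.device)
        x_hat = model(x)                           # assumed reconstruction output
        err = ((x_hat - x) ** 2).mean(dim=(1, 2))  # one score per window
        pred = (err > threshold).long().cpu()
        tp += int(((pred == 1) & (y == 1)).sum())
        fp += int(((pred == 1) & (y == 0)).sum())
        fn += int(((pred == 0) & (y == 1)).sum())
    model.train()
    f1 = 2 * tp / max(2 * tp + fp + fn, 1)         # F1 = 2TP / (2TP + FP + FN)
    return {"validation_f1": f1}
```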

0 commit comments
