2022-11-30 15:07:56 +01:00
from io import StringIO
2022-11-22 16:26:31 +01:00
from pathlib import Path
import json
2022-11-22 16:30:07 +01:00
import itertools
2022-11-22 16:27:23 +01:00
import yaml
import jsonschema
2022-12-13 17:51:01 +01:00
from typing import Any, Dict, List, Literal, NotRequired, Optional, TypedDict
2022-11-22 16:26:31 +01:00
import requests
2022-11-22 16:27:23 +01:00
from yachalk import chalk
yachalk_imported = True
except ModuleNotFoundError:
yachalk_imported = False
2022-11-22 16:26:31 +01:00
import os

# Directory that holds the dataset index and one sub-directory per model.
dataset_path = Path('dataset')
# Directory that receives the generated Markdown pages.
output_path = Path('pages')
# JSON index of all models in the dataset.
dataset_info = dataset_path / Path('dataset.json')
# GitHub API token used by get_json(). It is read from the GITHUB_TOKEN
# environment variable so that no credential is stored in the source tree;
# it is an empty string when the variable is not set.
# NOTE(review): the token that used to be hard-coded here has been published
# with the file and should be revoked.
token = os.environ.get("GITHUB_TOKEN", "")
2022-11-22 16:30:07 +01:00
def error(msg: str) -> Exception:
    """Print *msg* as an error and return an ``Exception`` carrying it.

    The message is printed in red when ``yachalk`` could be imported and with
    an ``Error: `` prefix otherwise. The exception is returned, not raised, so
    the caller decides whether to raise it.
    """
    if yachalk_imported:
        rendered = chalk.red(msg)
    else:
        rendered = "Error: {}".format(msg)
    print(rendered)
    return Exception(msg)
2022-11-22 16:27:23 +01:00
def warning(msg: str):
    """Print *msg* as a warning.

    The message is printed in yellow when ``yachalk`` could be imported and
    with a ``Warning: `` prefix otherwise.
    """
    if yachalk_imported:
        rendered = chalk.yellow(msg)
    else:
        rendered = "Warning: {}".format(msg)
    print(rendered)
2022-11-22 16:27:23 +01:00
2022-11-23 13:00:38 +01:00
class License(TypedDict):
    """License object embedded in a GitHub repository description."""

    key: str
    name: str
    # GitHub reports null here when the license has no SPDX identifier.
    spdx_id: Optional[str]
    # GitHub reports null here when there is no API page for the license.
    url: Optional[str]
    node_id: str
class Permissions(TypedDict):
    """Boolean permission flags of a GitHub repository description."""

    admin: bool
    maintain: bool
    push: bool
    triage: bool
    pull: bool
class Owner(TypedDict):
login: str
id: int
node_id: str
avatar_url: str
gravatar_id: str
url: str
html_url: str
followers_url: str
following_url: str
gists_url: str
starred_url: str
subscriptions_url: str
organizations_url: str
repos_url: str
events_url: str
received_events_url: str
type: str
site_admin: bool
name: NotRequired[str]
company: NotRequired[Optional[str]]
blog: NotRequired[str]
location: NotRequired[Optional[str]]
email: NotRequired[Optional[str]]
hireable: NotRequired[Optional[bool]]
bio: NotRequired[Optional[str]]
twitter_username: NotRequired[Optional[str]]
public_repos: NotRequired[int]
public_gists: NotRequired[int]
followers: NotRequired[int]
following: NotRequired[int]
created_at: NotRequired[str]
updated_at: NotRequired[str]
class GithubRepositoryInformation(TypedDict):
    """Repository description as returned by the GitHub "get a repository" API."""

    # --- identity ---
    id: int
    node_id: str
    name: str
    full_name: str
    private: bool
    owner: Owner
    html_url: str
    description: Optional[str]
    fork: bool

    # --- API endpoint URLs ---
    url: str
    forks_url: str
    keys_url: str
    collaborators_url: str
    teams_url: str
    hooks_url: str
    issue_events_url: str
    events_url: str
    assignees_url: str
    branches_url: str
    tags_url: str
    blobs_url: str
    git_tags_url: str
    git_refs_url: str
    trees_url: str
    statuses_url: str
    languages_url: str
    stargazers_url: str
    contributors_url: str
    subscribers_url: str
    subscription_url: str
    commits_url: str
    git_commits_url: str
    comments_url: str
    issue_comment_url: str
    contents_url: str
    compare_url: str
    merges_url: str
    archive_url: str
    downloads_url: str
    issues_url: str
    pulls_url: str
    milestones_url: str
    notifications_url: str
    labels_url: str
    releases_url: str
    deployments_url: str

    # --- timestamps and clone locations ---
    created_at: str
    updated_at: str
    pushed_at: str
    git_url: str
    ssh_url: str
    clone_url: str
    svn_url: str
    homepage: Optional[str]

    # --- statistics and feature flags ---
    size: int
    stargazers_count: int
    watchers_count: int
    # GitHub reports null when no primary language was detected.
    language: Optional[str]
    has_issues: bool
    has_projects: bool
    has_downloads: bool
    has_wiki: bool
    has_pages: bool
    forks_count: int
    # Null for ordinary repositories, a URL string for mirrors.
    mirror_url: Optional[str]
    archived: bool
    disabled: bool
    open_issues_count: int
    license: Optional[License]
    allow_forking: bool
    is_template: bool
    web_commit_signoff_required: bool
    topics: List[str]
    visibility: str
    forks: int
    open_issues: int
    watchers: int
    default_branch: str
    permissions: Permissions
    temp_clone_token: str
    organization: NotRequired[Owner]
    network_count: int
    subscribers_count: int
class ModelInformation(TypedDict):
    """One entry of the dataset index: a modelled repository and its figures."""

    title: NotRequired[str]
    # GitHub repository slug; used to build github.com and API URLs.
    slug: str
    # Branch used for artifact links; 'master' is assumed when absent.
    branch: NotRequired[str]
    # Cached GitHub API responses for the repository and its owner.
    data: GithubRepositoryInformation
    owner: Owner
    # Values copied from the cached API responses.
    stars: int
    forks: int
    owner_name: str
    owner_slug: str
    # Model statistics as rendered on the model page:
    # s = services, e = external entities, i = information flows,
    # a = annotations, t = total items, l = lines of code.
    s: int
    e: int
    i: int
    a: int
    t: int
    l: int
    # Technology names; each one is rendered as a tag on the model page.
    tech: List[str]
# The dataset index: maps a model identifier to the information kept for it.
Dataset = dict[str, ModelInformation]
def open_dataset() -> Dataset:
    """Read the JSON dataset index at ``dataset_info`` and return it."""
    with open(dataset_info, 'r') as index_file:
        loaded = json.load(index_file)
    return loaded
2022-11-23 13:00:38 +01:00
def save_dataset(dataset: Dataset):
    """Write *dataset* to ``dataset_info`` as JSON indented by four spaces."""
    with open(dataset_info, 'w') as index_file:
        json.dump(dataset, index_file, indent=4)
def get_json(uri: str):
    """Send a GET request to *uri* and return the decoded JSON body.

    The module-level ``token`` is sent as a bearer token; the header is
    omitted when the token is empty so that the request is made anonymously
    instead of with a malformed ``Authorization`` header.

    Raises:
        Exception: if the response status is not OK. The message contains the
            ``message`` field of the JSON error body when there is one and
            the raw response text otherwise.
    """
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    resp = requests.get(url=uri, headers=headers)
    if not resp.ok:
        try:
            resp_error = resp.json()['message']
        except Exception:
            # The error body is not JSON or has no 'message' entry.
            resp_error = resp.text
        raise Exception(f"Invalid response: {resp_error}")
    return resp.json()
def get_repo(slug: str):
    """Fetch the GitHub API description of the repository *slug*."""
    endpoint = f"https://api.github.com/repos/{slug}"
    return get_json(endpoint)
def get_user(name: str):
    """Fetch the GitHub API description of the account *name*."""
    endpoint = f"https://api.github.com/users/{name}"
    return get_json(endpoint)
def get_file(slug: str, path: str):
    """Fetch the GitHub contents-API entry for *path* in repository *slug*."""
    endpoint = f"https://api.github.com/repos/{slug}/contents/{path}"
    return get_json(endpoint)
def plural(amount: int, name: str, plural: str = 's'):
    """Return ``"<amount> <name>"`` with *plural* appended unless amount is 1.

    Args:
        amount: The count to display.
        name: Singular form of the counted noun.
        plural: Suffix appended for every amount other than exactly one.

    The suffix is chosen by comparing against 1 rather than by slicing with
    ``amount ^ 1``: the slice cut multi-character suffixes short for an
    amount of 0 and dropped the suffix for negative even amounts.
    """
    suffix = '' if amount == 1 else plural
    return f"{amount} {name}{suffix}"
2022-11-22 16:31:20 +01:00
from typing import TypedDict
2022-11-22 16:27:23 +01:00
2022-11-22 16:31:20 +01:00
class Artifact(TypedDict):
2022-11-22 16:27:23 +01:00
file: str
2022-12-13 14:50:02 +01:00
lines: NotRequired[list[int]]
2022-12-13 15:48:29 +01:00
repository: NotRequired[str]
2022-11-22 16:27:23 +01:00
2022-12-13 17:51:01 +01:00
# The verdicts a security rule can carry.
RuleStatus = Literal["disregarded", "observed", "not applicable", "unknown"]
2022-11-22 16:31:20 +01:00
class SecurityRule(TypedDict):
    """Evaluation of a single security rule for one model."""

    # Verdict of the evaluation.
    status: RuleStatus
    # Justification of the verdict: one text or a list of separate points.
    argument: str | list[str]
    # Source locations that back the verdict.
    artifacts: NotRequired[list[Artifact]]
2022-11-22 16:27:23 +01:00
# JSON schema (written as YAML) that every evaluated security rule must match.
# It describes the same shape as the SecurityRule and Artifact typed dicts:
# a mandatory status and argument plus an optional list of artifacts.
# NOTE(review): the structural keys (required / properties / enum / oneOf /
# items) were reconstructed from the surviving value lines — confirm against
# the authoritative schema.
rule_schema = yaml.safe_load("""type: object
additionalProperties: no
required:
    - status
    - argument
properties:
    status:
        type: string
        enum:
            - disregarded
            - observed
            - not applicable
            - unknown
    argument:
        oneOf:
            - type: string
            - type: array
              items:
                  type: string
    artifacts:
        type: array
        items:
            additionalProperties: no
            required:
                - file
            type: object
            properties:
                file:
                    type: string
                repository:
                    type: string
                lines:
                    type: array
                    items:
                        type: integer""")
2022-11-22 16:30:44 +01:00
def check_security_rules(security_rules: dict[Any, Any] | None) -> dict[int, SecurityRule]:
    """Validate the rules loaded from a security rules file.

    Rules 1 to 18 must all be present and match ``rule_schema``. A rule whose
    status is ``unknown`` is accepted but reported through ``warning``.

    Args:
        security_rules: The parsed content of the rules file, keyed by rule
            number, or ``None`` when the file was empty.

    Returns:
        The rules as a new dict ordered by key.

    Raises:
        Exception: if *security_rules* is ``None``, or if a rule is missing
            or violates the schema. Checking stops at the first such rule;
            the underlying ``jsonschema.ValidationError`` is chained.
    """
    if security_rules is None:
        raise Exception("Security rules file is empty!")
    for n in range(1, 19):
        try:
            rule = security_rules.get(n)
            if rule is None:
                # Reported through the same path as a schema violation.
                raise jsonschema.ValidationError(f"Rule {n} is not evaluated")
            jsonschema.validate(rule, rule_schema)
            if rule["status"] == "unknown":
                warning(f"Rule {n} is still unknown!")
        except jsonschema.ValidationError as e:
            warning("Not checking further rules!")
            raise Exception(
                "Security rule {n}: {msg} at $.{n}.{path}".format(n=n, msg=e.message, path=e.json_path)
            ) from e
    return dict(sorted(security_rules.items()))
2022-11-22 16:27:23 +01:00
2022-11-22 16:26:31 +01:00
# Switch checked in main(); it is off by default.
update_dataset = False
def get_name(slug: str):
    """Return the part of *slug* after its first ``/``.

    A slug without any ``/`` is returned unchanged.
    """
    return slug.split('/', 1)[-1]
2022-11-22 16:30:07 +01:00
def get_tag_slug(tag: str) -> str:
    """Return *tag* in lower case with every space replaced by ``_``."""
    lowered = tag.lower()
    return '_'.join(lowered.split(' '))
2022-11-23 13:00:38 +01:00
# Display name of each of the 18 security rules, keyed by rule number.
rule_names = {
    1: "API Gateway",
    2: "Mutual Authentication",
    3: "Decoupled Authentication",
    4: "Internal Identity Representation",
    5: "Authentication Token Validation",
    6: "Login Rate Limiting",
    7: "Edge Encryption",
    8: "Internal Encryption",
    9: "Central Logging Subsystem",
    10: "Local Logging Agent",
    11: "Log Sanitization",
    12: "Log Message Broker",
    13: "Circuit Breaker",
    14: "Load Balancing",
    15: "Service Mesh Usage Limits",
    16: "Service Registry Deployment",
    17: "Service Registry Validation",
    18: "Secret Manager",
}
def artifact_to_string(info: ModelInformation, artifact: Artifact):
    """Render *artifact* as one Markdown list item linking to GitHub.

    The link targets the artifact's own ``repository`` when it has one and
    the model's ``slug`` otherwise, on the model's ``branch`` (``master``
    when the model does not name one). Without ``lines`` the item links to
    the file as a whole; with ``lines`` it links to every listed line.
    """
    base_name = Path(artifact['file']).name
    repository = artifact.get('repository', info['slug'])
    branch = info.get('branch', 'master')
    file_url = f"https://github.com/{repository}/blob/{branch}/{artifact['file']}"

    line_numbers = artifact.get("lines")
    if line_numbers is None:
        return f"- {base_name}: [File]({file_url})"

    # "Line" for exactly one line number, "Lines" for any other count.
    suffix = '' if len(line_numbers) == 1 else 's'
    links = ', '.join(f'[{line}]({file_url}#L{line})' for line in line_numbers)
    return f"- {base_name}: Line{suffix}: {links}"
2022-11-23 13:00:38 +01:00
def rule_to_string(info: ModelInformation, id: int, rule: SecurityRule | None):
    """Render the evaluation of security rule *id* as a Markdown section.

    Args:
        info: The model the rule belongs to; passed on to
            ``artifact_to_string`` for building links.
        id: Number of the rule; looked up in ``rule_names`` and used for the
            ``{#ruleNN}`` heading anchor.
        rule: The evaluation, or ``None`` when the rule was not evaluated.

    Returns:
        The Markdown text, or an empty string (after a warning) when *rule*
        is ``None``.
    """
    if rule is None:
        warning(f"Rule {id} is missing!")
        return ""
    argument = rule['argument']
    if not isinstance(argument, str):
        # A list of points becomes a numbered list, one "1." item per point.
        argument = "".join(f"\n1. {arg}" for arg in argument)
    text = f"""#### Rule {id}: {rule_names[id]} {{#rule{id:02}}}

This rule is {rule['status']}: {argument}"""
    artifacts = rule.get("artifacts", [])
    if artifacts:
        # The blank line keeps the artifact list from being read as a
        # continuation of the preceding paragraph.
        text = text + f"""

{chr(10).join(artifact_to_string(info, artifact) for artifact in artifacts)}"""
    return text
def write_security_rules(info: ModelInformation, security_rules: dict[int, SecurityRule]):
    """Render the "Security Rules" chapter of a model page.

    The chapter starts with a one-row overview table that shows a status icon
    per rule, each linking to the rule's section, followed by the rules
    grouped under six headings. A rule missing from *security_rules* is
    shown with the ``unknown`` icon in the table.

    Args:
        info: The model the rules belong to; passed on to ``rule_to_string``.
        security_rules: The evaluations keyed by rule number (1 to 18).

    Returns:
        The Markdown text of the chapter.
    """
    icons: Dict[RuleStatus | str, str] = {
        'disregarded': '<i class="fa fa-exclamation-circle" style="color: #d72b28;"></i>',
        'observed': '<i class="fa fa-check-square-o" style="color: #6be16d;"></i>',
        # "#31708" had only five hex digits, which is not a valid CSS colour.
        # NOTE(review): "#31708f" (Bootstrap's info text colour) is assumed
        # to be the intended value — confirm.
        'not applicable': '<i class="fa fa-info-circle" style="color: #31708f;"></i>',
        'unknown': '<i class="fa fa-warning" style="color: #bfc600;"></i>',
    }
    rule_ids = range(1, 19)
    # Heading of each group together with the rule numbers it contains.
    sections = (
        ("Authentication / Authorization", range(1, 7)),
        ("Encryption", range(7, 9)),
        ("Logging", range(9, 13)),
        ("Availability", range(13, 16)),
        ("Service Registry", range(16, 18)),
        ("Secret Management", range(18, 19)),
    )

    header_row = " | ".join(f"R{i}" for i in rule_ids)
    divider_row = " | ".join("--" for _ in rule_ids)
    icon_row = " | ".join(
        f'<a href="#rule{i:02}">{icons[security_rules.get(i, {"status": "unknown"})["status"]]}</a>'
        for i in rule_ids
    )
    overview = "\n".join((header_row, divider_row, icon_row))

    # Headings, the table and the rule texts are separated by blank lines so
    # that each one is parsed as its own Markdown block.
    rendered_sections = [
        f"### {title}\n\n"
        + "\n\n".join(rule_to_string(info, i, security_rules.get(i)) for i in ids)
        for title, ids in sections
    ]
    return "## Security Rules\n\n" + overview + "\n\n" + "\n\n".join(rendered_sections)
2022-11-30 15:07:56 +01:00
def write_file_if_changed(file: Path, content: str, encoding: str = "utf-8"):
    """Write *content* to *file* unless the file already holds exactly that.

    Leaving unchanged files untouched keeps their modification time stable.

    Args:
        file: The file to write. Its parent directory must exist.
        content: The text the file should contain.
        encoding: Encoding used for both reading and writing.
    """
    old_content = None
    if file.exists():
        with file.open('r', encoding=encoding) as f:
            old_content = f.read()
    # old_content stays None for a new file and so never equals content.
    if old_content != content:
        print(f"Writing changed file: {file}")
        with file.open('w', encoding=encoding) as f:
            f.write(content)
2022-11-23 13:00:38 +01:00
# Write one Markdown page per dataset entry to output_path / 'dataset'.
# For every entry this completes the cached repository and owner data through
# the GitHub API helpers, copies stars, forks and owner names onto the entry
# (the entries of `dataset` are modified in place), validates the model's
# security_rules.yaml and hands the rendered page to write_file_if_changed.
# Raises Exception when a repository has no owner URL or the owner has no name.
# NOTE(review): the page template below appears to have lost lines in this
# view (front matter terminator, an emptied line, the end of the f-string) —
# confirm against the full file before editing it.
def write_model_readmes(dataset: Dataset):
2022-11-22 16:26:31 +01:00
for model_id, info in dataset.items():
2022-11-22 16:30:07 +01:00
dir = output_path / 'dataset'
readme = dir / f'{model_id}.md'
2022-11-23 13:00:38 +01:00
slug = info['slug']
2022-11-22 16:26:31 +01:00
# Repository data is fetched only when the entry does not carry it yet.
data = info.get('data')
if not data:
data = get_repo(slug)
info['data'] = data
owner_url = data.get('owner', {}).get('url')
if not owner_url:
raise Exception(f'No owner in repo {slug}!')
# Owner data is fetched only when the entry does not carry it yet.
owner = info.get('owner')
if not owner:
owner = get_json(owner_url)
info['owner'] = owner
owner_name = owner.get('name')
if not owner_name:
raise Exception(f'No owner name in repo {slug}!')
stars = data['stargazers_count']
forks = data['forks']
owner_slug = owner['login']
info['stars'] = stars
info['forks'] = forks
info['owner_name'] = owner_name
info['owner_slug'] = owner_slug
2022-11-23 13:07:21 +01:00
model_path = dataset_path / model_id
security_rules_file = model_path / 'security_rules.yaml'
model_file = model_path / f"{model_id}.py"
# NOTE(review): `model` is read here but not referenced again in the visible code.
with model_file.open("r") as f:
model = f.read()
2022-11-30 14:31:32 +01:00
# Stays None when the rules file is missing or invalid; the page is then
# written without a security rules chapter.
security_rules = None
2022-11-22 16:27:23 +01:00
2022-11-23 13:07:21 +01:00
with security_rules_file.open('r') as f:
2022-11-30 14:31:32 +01:00
security_rules = check_security_rules(yaml.safe_load(f))
2022-11-22 16:27:23 +01:00
except FileNotFoundError:
warning("Security rules file not found at {}".format(security_rules_file))
2022-11-22 16:30:44 +01:00
except Exception as e:
warning("Security rules file at {} is invalid: {}".format(security_rules_file, e))
2022-11-22 16:30:07 +01:00
2022-11-30 15:07:56 +01:00
write_file_if_changed(readme, f"""---
2022-11-22 16:30:07 +01:00
title: {slug}
keywords: model TODO
tags: [{', '.join(get_tag_slug(tech) for tech in info['tech'])}]
sidebar: datasetdoc_sidebar
permalink: {model_id}.html
2022-11-23 13:00:38 +01:00
toc: false
2022-11-22 16:30:07 +01:00
2022-11-22 16:26:31 +01:00
## Repository Information
Repository: [GitHub](https://github.com/{slug})
Owner: [{owner_name}](https://github.com/{owner_slug})
The repository has {plural(stars, 'star')} and was forked {plural(forks, 'time')}. The codebase consists of {plural(info['l'], 'line')} of code and makes use of the following technologies:
2022-11-23 11:48:09 +01:00
{chr(10).join(f'<a class="btn btn-primary" style="margin-bottom: 5px" role="button" href="tag_{get_tag_slug(tech)}.html">{tech}</a>' for tech in info['tech'])}
2022-11-22 16:26:31 +01:00
## Data Flow Diagram
2022-11-23 11:48:09 +01:00
### DFD Model
{{% include note.html content="Download the [model file](../../dataset/{model_id}/{model_id}.py)" %}}
The images below were generated by executing the model file. The DFD is represented as a CodeableModels file.
2022-11-23 13:07:21 +01:00
2022-11-22 16:26:31 +01:00
### Statistics
The Application consists of a total of {plural(info['t'], 'element')}:
Element | Count
-- | --
Services | {info['s']}
External Entities | {info['e']}
Information Flows | {info['i']}
Annotations | {info['a']}
Total Items | {info['t']}
### Diagram
2022-11-22 16:30:44 +01:00
- [PlantUML Model](../../dataset/{model_id}/{model_id}/{model_id}.txt)
- [SVG Vector Image](../../dataset/{model_id}/{model_id}/{model_id}.svg)
- [PNG Raster Image](../../dataset/{model_id}/{model_id}/{model_id}.png)
2022-11-22 16:26:31 +01:00
2022-11-23 11:49:26 +01:00

2022-11-23 13:00:38 +01:00
2022-11-30 14:31:32 +01:00
{"" if security_rules is None else write_security_rules(info, security_rules)}
2022-11-23 11:49:26 +01:00
2022-11-22 16:26:31 +01:00
2022-11-23 13:00:38 +01:00
# Write the two overview pages through write_file_if_changed: 'index.md' in
# the working directory (introduction and table of contents) and 'models.md'
# under output_path / 'overview' (one table row per dataset entry with links,
# lines of code, stars, forks, item count and number of technologies).
# NOTE(review): both page templates appear to have lost lines in this view
# (front matter terminators and the end of each f-string) — confirm against
# the full file before editing them.
def write_root_readme(dataset: Dataset):
2022-11-30 14:31:32 +01:00
overview_dir = output_path / 'overview'
index_file = Path('index.md')
2022-11-30 15:07:56 +01:00
write_file_if_changed(index_file, f"""---
2022-11-23 11:48:09 +01:00
title: code2DFD Documentation
keywords: code2DFD introduction
2022-11-30 14:31:32 +01:00
tags: [overview]
2022-11-22 16:30:07 +01:00
sidebar: datasetdoc_sidebar
permalink: index.html
summary: Dataset of dataflow diagrams of microservice applications.
2022-11-30 14:31:32 +01:00
toc: false
2022-11-22 16:30:07 +01:00
2022-11-30 15:07:56 +01:00
## DaFD
2022-11-23 11:48:09 +01:00
{{% include image.html file="TUHH_logo-wortmarke_en_rgb.svg" alt="TUHH Logo" max-width="350" %}}
{{% include image.html file="company_logo_big.png" alt="SoftSec Institute Logo" max-width="350" %}}
2022-11-30 14:31:32 +01:00
This is DaFD, a dataset containing Dataflow Diagrams (DFDs) of microservices written in Java. The models correspond to actual implementation code of open-source applications found on GitHub.
The DFDs are presented in multiple formats and contain full traceability of all model items to code, indicating the evidence for their implementation. Additionally to the models themselves, we present a mapping to a list of 17 architectural security best-practices, i.e. a table indicating whether each rules is followed or not. For those that are not followed, we created model variants that do follow the rule. These variants were crafted purely on the model-level and the added items do not correspond to code anymore. All artifacts were created manually by researchers of the Institute of Software Security at Hamburg University of Technology.
2022-11-23 11:48:09 +01:00
2022-11-30 15:07:56 +01:00
## Table of Contents
- [Overview](index.html)
- [Dataflow Diagrams](dfds.html)
- [Use-Cases](usecases.html)
- [Models](models.html)
2022-11-23 11:48:09 +01:00
2022-11-30 14:31:32 +01:00
models_file = overview_dir / 'models.md'
2022-11-30 15:07:56 +01:00
write_file_if_changed(models_file, f"""---
2022-11-30 14:31:32 +01:00
title: Models
2022-11-23 11:48:09 +01:00
keywords: dataset models
2022-11-30 14:31:32 +01:00
tags: [overview]
2022-11-23 11:48:09 +01:00
sidebar: datasetdoc_sidebar
2022-11-30 14:31:32 +01:00
permalink: models.html
2022-11-23 11:48:09 +01:00
summary: Dataset of dataflow diagrams of microservice applications.
2022-11-23 13:00:38 +01:00
datatable: true
2022-11-23 11:48:09 +01:00
2022-11-30 14:31:32 +01:00
The following table presents the models in this dataset. It shows some properties about their popularity and size of the models. Column `Source` links directly to the corresponding repository on GitHub. If you click on the name of an entry, you will be referred to the model and all artifacts.
2022-11-22 16:26:31 +01:00
2022-11-30 14:31:32 +01:00
Please select a model in column `Name`
2022-11-22 16:26:31 +01:00
2022-11-22 16:27:23 +01:00
<div class="datatable-begin"></div>
2022-11-22 16:26:31 +01:00
Name | Source | LoC | Stars | Forks | DFD Items | Technologies
-- | -- | -- | -- | -- | -- | --
2022-11-22 16:30:07 +01:00
{chr(10).join(f"[{info['slug']}]({model_id}.html) | [GitHub](https://github.com/{info['slug']}) | {info['l']} | {info['stars']} | {info['forks']} | {info['t']} | {len(info['tech'])}" for model_id, info in dataset.items())}
2022-11-22 16:27:23 +01:00
<div class="datatable-end"></div>
2022-11-22 16:26:31 +01:00
2022-11-23 13:00:38 +01:00
# Maintain the tag data of the site for every technology used in the dataset:
# the 'allowed-tags' list in _data/tags.yml is extended with the tag slug of
# each technology (existing entries are kept, the list is sorted and free of
# duplicates), and one 'tag_<slug>.md' page per technology is written to
# output_path / 'tags'. All files go through write_file_if_changed.
# NOTE(review): an `else:` seems to be missing before `tags = {}` in this view
# (as shown, the loaded tags would always be discarded), and the page template
# has lost its front matter terminator and the end of the f-string — confirm
# against the full file.
def write_tag_readme(dataset: Dataset):
2022-11-22 16:30:07 +01:00
tag_dir = output_path / 'tags'
# Every distinct technology name across all models.
known_tech = set(tech for model in dataset.values() for tech in model['tech'])
2022-11-30 15:07:56 +01:00
2022-11-23 11:48:09 +01:00
tags_data_path = Path('_data')
tags_data_file = tags_data_path / 'tags.yml'
2022-11-30 15:07:56 +01:00
if tags_data_file.exists():
tags_data_path.mkdir(exist_ok=True, parents=True)
with tags_data_file.open('r') as f:
tags: dict[Any, Any] = yaml.safe_load(f)
tags = {}
tags['allowed-tags'] = list(sorted(set(itertools.chain(tags.get('allowed-tags', []), (get_tag_slug(tech) for tech in known_tech)))))
# The YAML is rendered into memory first so that the file is only rewritten
# when its content actually changed.
with StringIO() as f:
2022-11-22 16:30:07 +01:00
yaml.dump(tags, f)
2022-11-30 15:07:56 +01:00
tags_content = f.getvalue()
write_file_if_changed(tags_data_file, tags_content)
2022-11-22 16:30:07 +01:00
for tech in known_tech:
slug = get_tag_slug(tech)
info_file = tag_dir / f'tag_{slug}.md'
2022-11-23 11:48:09 +01:00
tag_dir.mkdir(exist_ok=True, parents=True)
2022-11-30 15:07:56 +01:00
write_file_if_changed(info_file, f"""---
2022-11-22 16:30:07 +01:00
title: "{tech}"
tagName: {slug}
search: exclude
permalink: tag_{slug}.html
sidebar: datasetdoc_sidebar
2022-11-23 11:48:09 +01:00
hide_sidebar: true
2022-11-22 16:30:07 +01:00
folder: tags
{{% include taglogic.html %}}
{{% include links.html %}}
2022-11-22 16:26:31 +01:00
# Entry point of the script: loads the dataset index with open_dataset() and
# checks the update_dataset switch.
# NOTE(review): the statements that follow the load and the body of the
# `if update_dataset:` branch are not visible in this view, and `known_tags`
# is declared global without being assigned in the visible code — confirm
# against the full file.
def main():
2022-11-22 16:30:07 +01:00
global known_tags
2022-11-22 16:26:31 +01:00
dataset = open_dataset()
2022-11-22 16:30:07 +01:00
2022-11-22 16:26:31 +01:00
2022-11-22 16:30:07 +01:00
if update_dataset:
2022-11-22 16:26:31 +01:00
2022-11-22 16:30:07 +01:00
2022-11-22 16:26:31 +01:00
if __name__ == '__main__':
2022-11-22 16:29:30 +01:00