Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,21 @@ CHANGELOG

1.1.0
-----
### Core
- Subitems in fields of type `JSONDict` (see below) can be accessed directly. E.g. you can do:
event['extra.foo'] = 'bar'
event['extra.foo'] # gives 'bar'
It is still possible to set and get the field as a whole; however, this may be removed or changed in the future:
event['extra'] = '{"foo": "bar"}'
event['extra'] # gives '{"foo": "bar"}'

### Bots
#### Collectors
- Mail: New parameters; `sent_from`: filter messages by sender, `sent_to`: filter messages by recipient

### Harmonization
- Renamed `JSON` to `JSONDict` and added a new type `JSON`. `JSONDict` saves data internally as JSON, but acts like a dictionary. `JSON` accepts any valid JSON.

### Requirements
- Requests is no longer listed as a dependency of the core. For bots that depend on it, the requirement is noted in their REQUIREMENTS.txt file.

Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ See the changelog for a full list of changes.

1.1.0
-----
### Configuration
A new harmonization type `JSONDict` has been added specifically for the `extra` field. It is highly recommended to change the type of this field.

1.0.0
-----
Expand Down
11 changes: 10 additions & 1 deletion docs/Harmonization-fields.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ Harmonization field names
|Event_Description|event_description.text|[String](#string)|A free-form textual description of an abuse event.|
|Event_Description|event_description.url|[URL](#url)|A description URL is a link to a further description of the abuse event in question.|
| |event_hash|[UppercaseString](#uppercasestring)|Computed event hash with specific keys and values that identify a unique event. At present, the hash should default to using the SHA1 function. Please note that for an event hash to be able to match more than one event (deduplication) the receiver of an event should calculate it based on a minimal set of keys and values present in the event. Using for example the observation time in the calculation will most likely render the checksum useless for deduplication purposes.|
| |extra|[JSON](#json)|All anecdotal information, which cannot be parsed into the data harmonization elements. E.g. os.name, os.version, etc. **Note**: this is only intended for mapping any fields which can not map naturally into the data harmonization. It is not intended for extending the data harmonization with your own fields.|
| |extra|[JSONDict](#jsondict)|All anecdotal information, which cannot be parsed into the data harmonization elements. E.g. os.name, os.version, etc. **Note**: this is only intended for mapping any fields which can not map naturally into the data harmonization. It is not intended for extending the data harmonization with your own fields.|
|Feed|feed.accuracy|[Accuracy](#accuracy)|A float between 0 and 100 that represents how accurate the data in the feed is|
|Feed|feed.code|[String](#string)|Code name for the feed, e.g. DFGS, HSDAG etc.|
|Feed|feed.documentation|[String](#string)|A URL or hint where to find the documentation of this feed.|
Expand Down Expand Up @@ -158,6 +158,15 @@ Sanitation accepts strings and everything int() accepts.

JSON type.

Sanitation accepts any valid JSON.

Valid values are only unicode strings containing valid JSON.


### JSONDict

JSONDict type.

Sanitation accepts Python dictionaries and JSON strings.

Valid values are only unicode strings with JSON dictionaries.
Expand Down
2 changes: 1 addition & 1 deletion intelmq/bin/intelmq_psql_initdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def generate(harmonization_file=HARMONIZATION_CONF_FILE):
dbtype = 'real'
elif value['type'] == 'UUID':
dbtype = 'UUID'
elif value['type'] == 'JSON':
elif value['type'] in ('JSON', 'JSONDict'):
dbtype = 'json'
else:
raise ValueError('Unknown type %r.' % value['type'])
Expand Down
9 changes: 1 addition & 8 deletions intelmq/bots/parsers/generic/parser_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ def parse(self, report):
def parse_line(self, row, report):
event = self.new_event(report)

extra = {}
for key, value in zip(self.columns, row):
regex = self.column_regex_search.get(key, None)
if regex:
Expand All @@ -85,18 +84,12 @@ def parse_line(self, row, report):
value = self.type_translation[value]
elif not hasattr(self.parameters, 'type'):
continue
if key.startswith('extra.'):
if value:
extra[key[6:]] = value
else:
event.add(key, value)
event.add(key, value)

if hasattr(self.parameters, 'type')\
and "classification.type" not in event:
event.add('classification.type', self.parameters.type)
event.add("raw", self.recover_line(row))
if extra:
event.add('extra', extra)
yield event

recover_line = ParserBot.recover_line_csv
Expand Down
2 changes: 2 additions & 0 deletions intelmq/bots/parsers/shadowserver/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -919,6 +919,8 @@ def validate_fqdn(value):
('extra.', 'system', validate_to_none),
('extra.', 'detected_since', validate_to_none),
('extra.', 'server', validate_to_none),
('extra.', 'naics', invalidate_zero),
('extra.', 'sic', invalidate_zero),
],
'constant_fields': {
'classification.type': 'compromised',
Expand Down
2 changes: 1 addition & 1 deletion intelmq/etc/harmonization.conf
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@
},
"extra": {
"description": "All anecdotal information, which cannot be parsed into the data harmonization elements. E.g. os.name, os.version, etc. **Note**: this is only intended for mapping any fields which can not map naturally into the data harmonization. It is not intended for extending the data harmonization with your own fields.",
"type": "JSON"
"type": "JSONDict"
},
"feed.accuracy": {
"description": "A float between 0 and 100 that represents how accurate the data in the feed is",
Expand Down
53 changes: 50 additions & 3 deletions intelmq/lib/harmonization.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@

__all__ = ['Base64', 'Boolean', 'ClassificationType', 'DateTime', 'FQDN',
'Float', 'Accuracy', 'GenericType', 'IPAddress', 'IPNetwork',
'Integer', 'JSON', 'LowercaseString', 'Registry', 'String', 'URL',
'Integer', 'JSON', 'JSONDict', 'LowercaseString', 'Registry',
'String', 'URL',
]


Expand Down Expand Up @@ -523,6 +524,44 @@ class JSON(GenericType):
"""
JSON type.

Sanitation accepts any valid JSON objects.

Valid values are only unicode strings with JSON objects.
"""

@staticmethod
def is_valid(value, sanitize=False):
if sanitize:
value = JSON().sanitize(value)

if not isinstance(value, str):
return False

try:
json.loads(value)
except ValueError:
return False

return True

@staticmethod
def sanitize(value):
    """
    Turn value into a JSON string, or return None if that fails.

    None is passed through unchanged. A str or bytes value is first run
    through GenericType.sanitize and returned as-is when the result is
    already valid JSON. Everything else (including text that was not
    valid JSON) is serialized with json.dumps using sorted keys and the
    result is run through GenericType().sanitize. A TypeError raised on
    that path yields None.
    """
    if value is None:
        return None

    if isinstance(value, (str, bytes)):
        text = GenericType.sanitize(value)
        if JSON.is_valid(text):
            return text

    try:
        encoded = json.dumps(value, sort_keys=True)
        return GenericType().sanitize(encoded)
    except TypeError:
        return None


class JSONDict(JSON):
"""
JSONDict type.

Sanitation accepts pythons dictionaries and JSON strings.

Valid values are only unicode strings with JSON dictionaries.
Expand All @@ -531,7 +570,7 @@ class JSON(GenericType):
@staticmethod
def is_valid(value, sanitize=False):
if sanitize:
value = JSON().sanitize(value)
value = JSONDict().sanitize(value)

if not isinstance(value, str):
return False
Expand All @@ -546,19 +585,27 @@ def is_valid(value, sanitize=False):

return False

@staticmethod
def is_valid_subitem(value):
return True

@staticmethod
def sanitize(value):
if not value:
return None
if isinstance(value, (str, bytes)):
sanitized = GenericType.sanitize(value)
if JSON.is_valid(sanitized):
if JSONDict.is_valid(sanitized):
return sanitized
try:
return GenericType().sanitize(json.dumps(value, sort_keys=True))
except TypeError:
return None

@staticmethod
def sanitize_subitem(value):
return value


class LowercaseString(GenericType):

Expand Down
66 changes: 55 additions & 11 deletions intelmq/lib/message.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@

Use MessageFactory to get a Message object (types Report and Event).
"""
import functools
import hashlib
import json
import re
import warnings

import intelmq.lib.exceptions as exceptions
import intelmq.lib.harmonization
Expand Down Expand Up @@ -85,6 +85,8 @@ def serialize(message):

class Message(dict):

_IGNORED_VALUES = ["", "-", "N/A"]

def __init__(self, message=(), auto=False, harmonization=None):
try:
classname = message['__type'].lower()
Expand Down Expand Up @@ -114,6 +116,14 @@ def __init__(self, message=(), auto=False, harmonization=None):
def __setitem__(self, key, value):
self.add(key, value)

def __getitem__(self, key):
    """
    Return the value stored for key.

    For a key whose harmonization type is 'JSONDict' and which is not a
    sub-item (e.g. 'extra' rather than 'extra.foo'), the sub-items are
    collected from the hierarchical dict representation and returned as
    one JSON string, for backwards compatibility.

    Any other key, including '__type', is looked up directly in the
    underlying dict.

    Raises KeyError if the key (or its harmonization entry) is missing.
    """
    class_name, subitem = self.__get_type_config(key)
    # The type config is (None, None) for '__type'; guard against that
    # before subscripting, in the same way add() does.
    if class_name and class_name['type'] == 'JSONDict' and not subitem:
        # return extra as string for backwards compatibility
        return json.dumps(self.to_dict(hierarchical=True)[key.split('.')[0]])
    return super(Message, self).__getitem__(key)

def is_valid(self, key: str, value: str, sanitize: bool=True) -> bool:
"""
Checks if a value is valid for the key (after sanitation).
Expand Down Expand Up @@ -174,7 +184,7 @@ def add(self, key: str, value: str, sanitize: bool=True, force: bool=False,
if not overwrite and key in self:
raise exceptions.KeyExists(key)

if value is None or value in ["", "-", "N/A"]:
if value is None or value in self._IGNORED_VALUES:
if overwrite and key in self:
del self[key]
return
Expand Down Expand Up @@ -206,7 +216,19 @@ def add(self, key: str, value: str, sanitize: bool=True, force: bool=False,
else:
return False

super(Message, self).__setitem__(key, value)
class_name, subitem = self.__get_type_config(key)
if class_name and class_name['type'] == 'JSONDict' and not subitem:
# for backwards compatibility allow setting the extra field as string
for extrakey, extravalue in json.loads(value).items():
if hasattr(extravalue, '__len__'):
if not len(extravalue): # ignore empty values
continue
if extravalue in self._IGNORED_VALUES:
continue
super(Message, self).__setitem__('%s.%s' % (key, extrakey),
extravalue)
else:
super(Message, self).__setitem__(key, value)
return True

def update(self, other: dict):
Expand Down Expand Up @@ -251,17 +273,26 @@ def unserialize(message_string: str):
message = json.loads(message_string)
return message

@functools.lru_cache(maxsize=None)
def __is_valid_key(self, key: str):
if key in self.harmonization_config or key == '__type':
try:
class_name, subitem = self.__get_type_config(key)
except KeyError:
return False
if key in self.harmonization_config or key == '__type' or subitem:
return True
return False

def __is_valid_value(self, key: str, value: str):
if key == '__type':
return (True, )
config = self.__get_type_config(key)
config, subitem = self.__get_type_config(key)
class_reference = getattr(intelmq.lib.harmonization, config['type'])
if not class_reference().is_valid(value):
if not subitem:
validation = class_reference().is_valid(value)
else:
validation = class_reference().is_valid_subitem(value)
if not validation:
return (False, 'is_valid returned False.')
if 'length' in config:
length = len(str(value))
Expand All @@ -277,13 +308,26 @@ def __is_valid_value(self, key: str, value: str):
return (True, )

def __sanitize_value(self, key: str, value: str):
class_name = self.__get_type_config(key)['type']
class_reference = getattr(intelmq.lib.harmonization, class_name)
return class_reference().sanitize(value)
class_name, subitem = self.__get_type_config(key)
class_reference = getattr(intelmq.lib.harmonization, class_name['type'])
if not subitem:
return class_reference().sanitize(value)
else:
return class_reference().sanitize_subitem(value)

@functools.lru_cache(maxsize=None)
def __get_type_config(self, key: str):
class_name = self.harmonization_config[key]
return class_name
if key == '__type':
return None, None
try:
class_name = self.harmonization_config[key]
except KeyError:
# Could be done recursively in the future if needed
class_name = self.harmonization_config[key.split('.')[0]]
subitem = True
else:
subitem = False
return class_name, subitem

def __hash__(self):
return int(self.hash(), 16)
Expand Down
41 changes: 41 additions & 0 deletions intelmq/tests/bots/experts/filter/test_extra.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-

import unittest

import intelmq.lib.test as test
from intelmq.bots.experts.filter.expert import FilterExpertBot

EXAMPLE_INPUT = {"__type": "Event",
"classification.type": "defacement",
"time.source": "2005-01-01T00:00:00+00:00",
"source.asn": 123,
"extra.test1": True,
"extra.test2": "bla",
}


class TestFilterExpertBot(test.BotTestCase, unittest.TestCase):
    """
    Test case running FilterExpertBot with filters on 'extra.*' keys.
    """

    @classmethod
    def set_bot(cls):
        """Set the bot class, the input message and the default filter."""
        cls.sysconfig = dict(filter_key='extra.test1',
                             filter_value=True,
                             filter_action='drop')
        cls.input_message = EXAMPLE_INPUT
        cls.bot_reference = FilterExpertBot

    def test_extra_filter_drop(self):
        """Run the bot with the default 'drop' filter on extra.test1."""
        self.run_bot()

    def test_extra_filter_keep(self):
        """Run with a 'keep' filter on extra.test2 and compare message 0."""
        keep_config = dict(filter_key='extra.test2',
                           filter_value='bla',
                           filter_action='keep')
        self.sysconfig = keep_config
        self.run_bot()
        self.assertMessageEqual(0, EXAMPLE_INPUT)

if __name__ == '__main__': # pragma: no cover
unittest.main()
2 changes: 1 addition & 1 deletion intelmq/tests/bots/outputs/mongodb/test_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"extra": '{"foo.bar": "test"}'
}
OUTPUT1 = {'classification': {'type': 'botnet drone'},
'extra': '{"foo.bar": "test"}',
'extra': {"foo": {"bar": "test"}},
'feed': {'name': 'Example Feed'},
'source': {'asn': 64496, 'ip': '192.0.2.1'},
}
Expand Down
2 changes: 1 addition & 1 deletion intelmq/tests/bots/outputs/redis/test_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
"source.port": 65118,
"__type": "Event",
"feed.name": "BitSight",
"extra": '{"non_ascii": "ççãããã\x80\ua000 \164 \x80\x80 abcd \165\166"}',
"extra.non_ascii": "ççãããã\x80\ua000 \164 \x80\x80 abcd \165\166",
"raw": "eyJ0cm9qYW5mYW1pbHkiOiJTYWxpdHlwMnAiLCJlbnYiOnsic"
"mVtb3RlX2FkZHIiOiIxNTIuMTY2LjExOS4yIiwicmVtb3RlX3"
"BvcnQiOiI2NTExOCIsInNlcnZlcl9hZGRyIjoiNTIuMTguMTk"
Expand Down
Loading