certtools · Aug 23, 2017 · Jul 18, 2017 · Jul 24, 2017 · Jul 25, 2017 · Aug 16, 2017
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,11 +3,21 @@ CHANGELOG
 
 1.1.0
 -----
+### Core
+- Subitems in fields of type `JSONDict` (see below) can be accessed directly. E.g. you can do:
+  event['extra.foo'] = 'bar'
+  event['extra.foo'] # gives 'bar'
+  It is still possible to set and get the field as a whole; however, this may be removed or changed in the future:
+  event['extra'] = '{"foo": "bar"}'
+  event['extra'] # gives '{"foo": "bar"}'
 
 ### Bots
 #### Collectors
 - Mail: New parameters; `sent_from`: filter messages by sender, `sent_to`: filter messages by recipient
 
+### Harmonization
+- Renamed `JSON` to `JSONDict` and added a new type `JSON`. `JSONDict` saves data internally as JSON, but acts like a dictionary. `JSON` accepts any valid JSON.
+
 ### Requirements
 - Requests is no longer a listed as dependency of the core. For depending bots the requirement is noted in their REQUIREMENTS.txt file
 

diff --git a/NEWS.md b/NEWS.md
@@ -5,6 +5,8 @@ See the changelog for a full list of changes.
 
 1.1.0
 -----
+### Configuration
+A new harmonization type `JSONDict` has been added specifically for the `extra` field. It is highly recommended to change the type of this field to `JSONDict` in your harmonization configuration.
 
 1.0.0
 -----

diff --git a/docs/Harmonization-fields.md b/docs/Harmonization-fields.md
@@ -34,7 +34,7 @@ Harmonization field names
 |Event_Description|event_description.text|[String](#string)|A free-form textual description of an abuse event.|
 |Event_Description|event_description.url|[URL](#url)|A description URL is a link to a further description of the the abuse event in question.|
 | |event_hash|[UppercaseString](#uppercasestring)|Computed event hash with specific keys and values that identify a unique event. At present, the hash should default to using the SHA1 function. Please note that for an event hash to be able to match more than one event (deduplication) the receiver of an event should calculate it based on a minimal set of keys and values present in the event. Using for example the observation time in the calculation will most likely render the checksum useless for deduplication purposes.|
-| |extra|[JSON](#json)|All anecdotal information, which cannot be parsed into the data harmonization elements. E.g. os.name, os.version, etc.  **Note**: this is only intended for mapping any fields which can not map naturally into the data harmonization. It is not intended for extending the data harmonization with your own fields.|
+| |extra|[JSONDict](#jsondict)|All anecdotal information, which cannot be parsed into the data harmonization elements. E.g. os.name, os.version, etc.  **Note**: this is only intended for mapping any fields which can not map naturally into the data harmonization. It is not intended for extending the data harmonization with your own fields.|
 |Feed|feed.accuracy|[Accuracy](#accuracy)|A float between 0 and 100 that represents how accurate the data in the feed is|
 |Feed|feed.code|[String](#string)|Code name for the feed, e.g. DFGS, HSDAG etc.|
 |Feed|feed.documentation|[String](#string)|A URL or hint where to find the documentation of this feed.|
@@ -158,6 +158,15 @@ Sanitation accepts strings and everything int() accepts.
 
 JSON type.
 
+Sanitation accepts any valid JSON value.
+
+Valid values are only unicode strings containing valid JSON.
+
+
+### JSONDict
+
+JSONDict type.
+
 Sanitation accepts pythons dictionaries and JSON strings.
 
 Valid values are only unicode strings with JSON dictionaries.

diff --git a/intelmq/bin/intelmq_psql_initdb.py b/intelmq/bin/intelmq_psql_initdb.py
@@ -55,7 +55,7 @@ def generate(harmonization_file=HARMONIZATION_CONF_FILE):
             dbtype = 'real'
         elif value['type'] == 'UUID':
             dbtype = 'UUID'
-        elif value['type'] == 'JSON':
+        elif value['type'] in ('JSON', 'JSONDict'):
             dbtype = 'json'
         else:
             raise ValueError('Unknown type %r.' % value['type'])

diff --git a/intelmq/bots/parsers/generic/parser_csv.py b/intelmq/bots/parsers/generic/parser_csv.py
@@ -64,7 +64,6 @@ def parse(self, report):
     def parse_line(self, row, report):
         event = self.new_event(report)
 
-        extra = {}
         for key, value in zip(self.columns, row):
             regex = self.column_regex_search.get(key, None)
             if regex:
@@ -85,18 +84,12 @@ def parse_line(self, row, report):
                     value = self.type_translation[value]
                 elif not hasattr(self.parameters, 'type'):
                     continue
-            if key.startswith('extra.'):
-                if value:
-                    extra[key[6:]] = value
-            else:
-                event.add(key, value)
+            event.add(key, value)
 
         if hasattr(self.parameters, 'type')\
                 and "classification.type" not in event:
             event.add('classification.type', self.parameters.type)
         event.add("raw", self.recover_line(row))
-        if extra:
-            event.add('extra', extra)
         yield event
 
     recover_line = ParserBot.recover_line_csv

diff --git a/intelmq/bots/parsers/shadowserver/config.py b/intelmq/bots/parsers/shadowserver/config.py
@@ -919,6 +919,8 @@ def validate_fqdn(value):
         ('extra.', 'system', validate_to_none),
         ('extra.', 'detected_since', validate_to_none),
         ('extra.', 'server', validate_to_none),
+        ('extra.', 'naics', invalidate_zero),
+        ('extra.', 'sic', invalidate_zero),
     ],
     'constant_fields': {
         'classification.type': 'compromised',

diff --git a/intelmq/etc/harmonization.conf b/intelmq/etc/harmonization.conf
@@ -130,7 +130,7 @@
         },
         "extra": {
             "description": "All anecdotal information, which cannot be parsed into the data harmonization elements. E.g. os.name, os.version, etc.  **Note**: this is only intended for mapping any fields which can not map naturally into the data harmonization. It is not intended for extending the data harmonization with your own fields.",
-            "type": "JSON"
+            "type": "JSONDict"
         },
         "feed.accuracy": {
             "description": "A float between 0 and 100 that represents how accurate the data in the feed is",

diff --git a/intelmq/lib/harmonization.py b/intelmq/lib/harmonization.py
@@ -31,7 +31,8 @@
 
 __all__ = ['Base64', 'Boolean', 'ClassificationType', 'DateTime', 'FQDN',
            'Float', 'Accuracy', 'GenericType', 'IPAddress', 'IPNetwork',
-           'Integer', 'JSON', 'LowercaseString', 'Registry', 'String', 'URL',
+           'Integer', 'JSON', 'JSONDict', 'LowercaseString', 'Registry',
+           'String', 'URL',
            ]
 
 
@@ -523,6 +524,44 @@ class JSON(GenericType):
     """
     JSON type.
 
+    Sanitation accepts any valid JSON objects.
+
+    Valid values are only unicode strings with JSON objects.
+    """
+
+    @staticmethod
+    def is_valid(value, sanitize=False):
+        if sanitize:
+            value = JSON().sanitize(value)
+
+        if not isinstance(value, str):
+            return False
+
+        try:
+            json.loads(value)
+        except ValueError:
+            return False
+
+        return True
+
+    @staticmethod
+    def sanitize(value):
+        if value is None:
+            return None
+        if isinstance(value, (str, bytes)):
+            sanitized = GenericType.sanitize(value)
+            if JSON.is_valid(sanitized):
+                return sanitized
+        try:
+            return GenericType().sanitize(json.dumps(value, sort_keys=True))
+        except TypeError:
+            return None
+
+
+class JSONDict(JSON):
+    """
+    JSONDict type.
+
     Sanitation accepts pythons dictionaries and JSON strings.
 
     Valid values are only unicode strings with JSON dictionaries.
@@ -531,7 +570,7 @@ class JSON(GenericType):
     @staticmethod
     def is_valid(value, sanitize=False):
         if sanitize:
-            value = JSON().sanitize(value)
+            value = JSONDict().sanitize(value)
 
         if not isinstance(value, str):
             return False
@@ -546,19 +585,27 @@ def is_valid(value, sanitize=False):
 
         return False
 
+    @staticmethod
+    def is_valid_subitem(value):
+        return True
+
     @staticmethod
     def sanitize(value):
         if not value:
             return None
         if isinstance(value, (str, bytes)):
             sanitized = GenericType.sanitize(value)
-            if JSON.is_valid(sanitized):
+            if JSONDict.is_valid(sanitized):
                 return sanitized
         try:
             return GenericType().sanitize(json.dumps(value, sort_keys=True))
         except TypeError:
             return None
 
+    @staticmethod
+    def sanitize_subitem(value):
+        return value
+
 
 class LowercaseString(GenericType):
 

diff --git a/intelmq/lib/message.py b/intelmq/lib/message.py
@@ -4,10 +4,10 @@
 
 Use MessageFactory to get a Message object (types Report and Event).
 """
+import functools
 import hashlib
 import json
 import re
-import warnings
 
 import intelmq.lib.exceptions as exceptions
 import intelmq.lib.harmonization
@@ -85,6 +85,8 @@ def serialize(message):
 
 class Message(dict):
 
+    _IGNORED_VALUES = ["", "-", "N/A"]
+
     def __init__(self, message=(), auto=False, harmonization=None):
         try:
             classname = message['__type'].lower()
@@ -114,6 +116,14 @@ def __init__(self, message=(), auto=False, harmonization=None):
     def __setitem__(self, key, value):
         self.add(key, value)
 
+    def __getitem__(self, key):
+        class_name, subitem = self.__get_type_config(key)
+        if class_name['type'] == 'JSONDict' and not subitem:
+            # return extra as string for backwards compatibility
+            return json.dumps(self.to_dict(hierarchical=True)[key.split('.')[0]])
+        else:
+            return super(Message, self).__getitem__(key)
+
     def is_valid(self, key: str, value: str, sanitize: bool=True) -> bool:
         """
         Checks if a value is valid for the key (after sanitation).
@@ -174,7 +184,7 @@ def add(self, key: str, value: str, sanitize: bool=True, force: bool=False,
         if not overwrite and key in self:
             raise exceptions.KeyExists(key)
 
-        if value is None or value in ["", "-", "N/A"]:
+        if value is None or value in self._IGNORED_VALUES:
             if overwrite and key in self:
                 del self[key]
             return
@@ -206,7 +216,19 @@ def add(self, key: str, value: str, sanitize: bool=True, force: bool=False,
             else:
                 return False
 
-        super(Message, self).__setitem__(key, value)
+        class_name, subitem = self.__get_type_config(key)
+        if class_name and class_name['type'] == 'JSONDict' and not subitem:
+            # for backwards compatibility allow setting the extra field as string
+            for extrakey, extravalue in json.loads(value).items():
+                if hasattr(extravalue, '__len__'):
+                    if not len(extravalue):  # ignore empty values
+                        continue
+                if extravalue in self._IGNORED_VALUES:
+                    continue
+                super(Message, self).__setitem__('%s.%s' % (key, extrakey),
+                                                 extravalue)
+        else:
+            super(Message, self).__setitem__(key, value)
         return True
 
     def update(self, other: dict):
@@ -251,17 +273,26 @@ def unserialize(message_string: str):
         message = json.loads(message_string)
         return message
 
+    @functools.lru_cache(maxsize=None)
     def __is_valid_key(self, key: str):
-        if key in self.harmonization_config or key == '__type':
+        try:
+            class_name, subitem = self.__get_type_config(key)
+        except KeyError:
+            return False
+        if key in self.harmonization_config or key == '__type' or subitem:
             return True
         return False
 
     def __is_valid_value(self, key: str, value: str):
         if key == '__type':
             return (True, )
-        config = self.__get_type_config(key)
+        config, subitem = self.__get_type_config(key)
         class_reference = getattr(intelmq.lib.harmonization, config['type'])
-        if not class_reference().is_valid(value):
+        if not subitem:
+            validation = class_reference().is_valid(value)
+        else:
+            validation = class_reference().is_valid_subitem(value)
+        if not validation:
             return (False, 'is_valid returned False.')
         if 'length' in config:
             length = len(str(value))
@@ -277,13 +308,26 @@ def __is_valid_value(self, key: str, value: str):
         return (True, )
 
     def __sanitize_value(self, key: str, value: str):
-        class_name = self.__get_type_config(key)['type']
-        class_reference = getattr(intelmq.lib.harmonization, class_name)
-        return class_reference().sanitize(value)
+        class_name, subitem = self.__get_type_config(key)
+        class_reference = getattr(intelmq.lib.harmonization, class_name['type'])
+        if not subitem:
+            return class_reference().sanitize(value)
+        else:
+            return class_reference().sanitize_subitem(value)
 
+    @functools.lru_cache(maxsize=None)
     def __get_type_config(self, key: str):
-        class_name = self.harmonization_config[key]
-        return class_name
+        if key == '__type':
+            return None, None
+        try:
+            class_name = self.harmonization_config[key]
+        except KeyError:
+            # Could be done recursively in the future if needed
+            class_name = self.harmonization_config[key.split('.')[0]]
+            subitem = True
+        else:
+            subitem = False
+        return class_name, subitem
 
     def __hash__(self):
         return int(self.hash(), 16)

diff --git a/intelmq/tests/bots/experts/filter/test_extra.py b/intelmq/tests/bots/experts/filter/test_extra.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+
+import intelmq.lib.test as test
+from intelmq.bots.experts.filter.expert import FilterExpertBot
+
+EXAMPLE_INPUT = {"__type": "Event",
+                 "classification.type": "defacement",
+                 "time.source": "2005-01-01T00:00:00+00:00",
+                 "source.asn": 123,
+                 "extra.test1": True,
+                 "extra.test2": "bla",
+                 }
+
+
+class TestFilterExpertBot(test.BotTestCase, unittest.TestCase):
+    """
+    A TestCase for FilterExpertBot.
+    """
+
+    @classmethod
+    def set_bot(cls):
+        cls.bot_reference = FilterExpertBot
+        cls.input_message = EXAMPLE_INPUT
+        cls.sysconfig = {'filter_key': 'extra.test1',
+                         'filter_value': True,
+                         'filter_action': 'drop'}
+
+    def test_extra_filter_drop(self):
+        self.run_bot()
+
+    def test_extra_filter_keep(self):
+        self.sysconfig = {'filter_key': 'extra.test2',
+                         'filter_value': 'bla',
+                         'filter_action': 'keep'}
+        self.run_bot()
+        self.assertMessageEqual(0, EXAMPLE_INPUT)
+
+if __name__ == '__main__':  # pragma: no cover
+    unittest.main()
diff --git a/intelmq/tests/bots/outputs/mongodb/test_output.py b/intelmq/tests/bots/outputs/mongodb/test_output.py
@@ -16,7 +16,7 @@
           "extra": '{"foo.bar": "test"}'
           }
 OUTPUT1 = {'classification': {'type': 'botnet drone'},
-           'extra': '{"foo.bar": "test"}',
+           'extra': {"foo": {"bar": "test"}},
            'feed': {'name': 'Example Feed'},
            'source': {'asn': 64496, 'ip': '192.0.2.1'},
            }

diff --git a/intelmq/tests/bots/outputs/redis/test_output.py b/intelmq/tests/bots/outputs/redis/test_output.py
@@ -23,7 +23,7 @@
                  "source.port": 65118,
                  "__type": "Event",
                  "feed.name": "BitSight",
-                 "extra": '{"non_ascii": "ççãããã\x80\ua000 \164 \x80\x80 abcd \165\166"}',
+                 "extra.non_ascii": "ççãããã\x80\ua000 \164 \x80\x80 abcd \165\166",
                  "raw": "eyJ0cm9qYW5mYW1pbHkiOiJTYWxpdHlwMnAiLCJlbnYiOnsic"
                  "mVtb3RlX2FkZHIiOiIxNTIuMTY2LjExOS4yIiwicmVtb3RlX3"
                  "BvcnQiOiI2NTExOCIsInNlcnZlcl9hZGRyIjoiNTIuMTguMTk"
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,6 +5,8 @@ See the changelog for a full list of changes. @@
 1.1.0
     -----
+    ### Configuration
+    A new harmonization type `JSONDict` has been added specifically for the `extra` field. It is highly recommended to change the type of this field.
 1.0.0
     -----
@@ Expand Down @@