Skip to content
Snippets Groups Projects
Commit 57503890 authored by Jayesh's avatar Jayesh
Browse files

perplexity and coherence analysis

parent c26d4b62
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
``` python
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
```
%% Cell type:code id: tags:
``` python
# Read the two inputs used by the rest of the notebook. Each path is defined
# directly above the call that consumes it.
# NOTE(review): the literals are raw strings, so every "\\" stays as two
# backslash characters in the resulting path - confirm this is intended.

# Document-term matrix (CSV)
dtm_path = r"E:\\DSSE\\DSSE-Group-7\\Assignment_2\\Week 2\\task1\\yarn_document_term_matrix.csv"
dtm = pd.read_csv(dtm_path)

# Ontology sheet (Excel)
ontology_path = r"E:\\DSSE\\DSSE-Group-7\\Assignment_2\\Week 2\\task2-iter2\\ontology_sheet.xlsx"
ontology = pd.read_excel(ontology_path)
```
%% Cell type:code id: tags:
``` python
# Convert the term-frequency columns (every column after the first) to numeric.
# The frame is rebuilt instead of assigning through ``dtm.iloc[:, 1:] = ...``:
# that form writes into the existing columns and, on recent pandas versions,
# can leave them with their previous dtype (e.g. object) even though the
# values were converted.
dtm = pd.concat(
    [dtm.iloc[:, :1], dtm.iloc[:, 1:].apply(pd.to_numeric)],
    axis=1,
)
```
%% Cell type:code id: tags:
``` python
# Preview the first rows of the matrix (head() returns five rows by default).
dtm.head()
```
%% Output
Issue key ability able abort absence absolute absolutely abstract \
0 YARN-10930 0 0 0 0 3 0 0
1 YARN-10562 0 0 0 0 0 0 0
2 YARN-10514 0 0 0 0 0 0 0
3 YARN-10494 0 0 0 0 0 0 0
4 YARN-10493 0 0 0 0 0 0 0
abstraction abstractservice ... yes youd yufeigu zero zk \
0 0 0 ... 0 0 0 0 0
1 0 0 ... 0 0 0 0 0
2 0 0 ... 0 0 0 0 0
3 0 0 ... 0 0 0 0 0
4 0 0 ... 0 0 0 0 0
zkcuratormanager zkrmstatestore znode zookeeper ztang
0 0 0 0 0 0
1 0 0 0 0 0
2 0 0 0 0 0
3 0 0 0 0 0
4 0 0 0 0 0
[5 rows x 2938 columns]
%% Cell type:code id: tags:
``` python
# Build a lookup from ontology term to ontology class.
# Every column header of the ontology sheet is used as the class label, and
# the distinct non-null cell values of that column are the terms of the class.
ontology_dict = {}
for col in ontology.columns:
    # str() guards against headers that were not read back as strings.
    class_label = str(col).strip()
    for term in ontology[col].dropna().unique():
        # Cells are not guaranteed to be strings (a sheet can hold numbers),
        # so coerce before stripping instead of calling term.strip() directly.
        key = str(term).strip()
        if not key:
            # Whitespace-only cells carry no term.
            continue
        # A term listed under several columns keeps the class of the last
        # column visited (plain dict overwrite).
        ontology_dict[key] = class_label
ontology_dict
```
%% Output
{'method': 'SoftwareArtifact_',
'methods': 'SoftwareArtifact_',
'stored procedure': 'SoftwareArtifact_',
'procedure': 'SoftwareArtifact_',
'procedures': 'SoftwareArtifact_',
'operation': 'SoftwareArtifact_',
'operations': 'SoftwareArtifact_',
'interface': 'SoftwareArtifact_',
'interfaces': 'SoftwareArtifact_',
'field': 'SoftwareArtifact_',
'fields': 'SoftwareArtifact_',
'event': 'SoftwareArtifact_',
'events': 'SoftwareArtifact_',
'button': 'SoftwareArtifact_',
'buttons': 'SoftwareArtifact_',
'text box': 'SoftwareArtifact_',
'endpoint': 'Pattern_',
'endpoints': 'SoftwareArtifact_',
'job': 'SoftwareArtifact_',
'jobs': 'SoftwareArtifact_',
'function': 'ProgrammingConcept_',
'functions': 'SoftwareArtifact_',
'class': 'SoftwareArtifact_',
'classes': 'SoftwareArtifact_',
'file': 'SoftwareArtifact_',
'files': 'SoftwareArtifact_',
'module': 'SoftwareArtifact_',
'modules': 'SoftwareArtifact_',
'abstract': 'SoftwareArtifact_',
'legacy': 'Component_',
'component': 'Component_',
'components': 'Component_',
'application': 'Component_',
'applications': 'Component_',
'machine': 'Component_',
'machines': 'Component_',
'server': 'Pattern_',
'servers': 'Technology_',
'client': 'Pattern_',
'clients': 'Technology_',
'users': 'Component_',
'user': 'Component_',
'back end': 'Component_',
'back ends': 'Component_',
'service': 'Component_',
'services': 'Component_',
'system': 'Component_',
'systems': 'Component_',
'thread': 'Component_',
'threads': 'Component_',
'platform': 'Technology_',
'platforms': 'Component_',
'devices': 'Component_',
'device': 'Component_',
'source': 'Component_',
'sources': 'Component_',
'beans': 'Component_',
'bean': 'Component_',
'table': 'Component_',
'tables': 'Component_',
'record': 'Component_',
'records': 'Component_',
'queue': 'Pattern_',
'queues': 'Component_',
'front ends': 'Component_',
'front end': 'Component_',
'mainframe': 'Component_',
'host': 'Component_',
'hosts': 'Component_',
'APIs': 'Component_',
'API': 'Component_',
'app': 'Component_',
'apps': 'Component_',
'boxes': 'Component_',
'box': 'Component_',
'Q': 'Component_',
'physical locations': 'Component_',
'physical location': 'Component_',
'engine': 'Component_',
'engines': 'Component_',
'process': 'Component_',
'processes': 'Component_',
'program': 'Component_',
'programs': 'Component_',
'listener': 'Component_',
'Listeners': 'Component_',
'workers': 'Component_',
'worker': 'Component_',
'computers': 'Component_',
'computer': 'Component_',
'model': 'Component_',
'models': 'Component_',
'entities': 'Component_',
'entity': 'Component_',
'data store': 'Component_',
'datastore': 'Component_',
'store': 'Connector_',
'context': 'Component_',
'ViewModels': 'Component_',
'database': 'Technology_',
'producers': 'Component_',
'producer': 'Component_',
'consumers': 'Component_',
'consumer': 'Component_',
'session': 'Component_',
'sessions': 'Component_',
'logic': 'Component_',
'page': 'Component_',
'partition': 'Component_',
'back_end': 'Component_',
'front_end': 'Component_',
'rm': 'Component_',
'nm': 'Component_',
'resourcemanager': 'Component_',
'resource': 'Component_',
'nodemanager': 'Component_',
'socket': 'Connector_Data_',
'data': 'Connector_Data_',
'payload': 'Connector_Data_',
'payloads': 'Connector_Data_',
'object': 'Connector_Data_',
'objects': 'Connector_Data_',
'messages': 'Connector_Data_',
'message': 'Connector_Data_',
'XML': 'Technology_',
'dump': 'Connector_',
'updates': 'Connector_Data_',
'requests': 'Connector_Data_',
'request': 'Connector_Data_',
'map': 'Connector_Data_',
'structure': 'Connector_Data_',
'structures': 'Connector_Data_',
'maps': 'Connector_Data_',
'reply': 'Connector_Data_',
'replies': 'Connector_Data_',
'item': 'Connector_Data_',
'items': 'Connector_Data_',
'list': 'Connector_Data_',
'lists': 'Connector_Data_',
'result': 'Connector_Data_',
'results': 'Connector_Data_',
'tasks': 'Connector_Data_',
'task': 'Connector_Data_',
'information': 'Connector_Data_',
'call': 'Connector_',
'calls': 'Connector_',
'kb': 'Connector_Data_',
'mb': 'Connector_Data_',
'notification': 'Connector_Data_',
'token': 'Connector_Data_',
'retrieve': 'Connector_',
'retrieves': 'Connector_',
'retrieving': 'Connector_',
'commit': 'Connector_',
'commits': 'Connector_',
'committing': 'Connector_',
'consuming': 'Connector_',
'consume': 'Connector_',
'consumes': 'Connector_',
'communication': 'Connector_',
'communications': 'Connector_',
'communicate': 'Connector_',
'communicated': 'Connector_',
'communicates': 'Connector_',
'execute': 'Connector_',
'executes': 'Connector_',
'execution': 'Connector_',
'connect': 'Connector_',
'connected': 'Connector_',
'connects': 'Connector_',
'connection': 'Connector_',
'connectivity': 'Connector_',
'write': 'Connector_',
'writes': 'Connector_',
'writing': 'Connector_',
'upload': 'Connector_',
'uploading': 'Connector_',
'uploaded': 'Connector_',
'work with': 'Connector_',
'send': 'Connector_',
'sends': 'Connector_',
'sending': 'Connector_',
'sent': 'Connector_',
'implementing': 'Connector_',
'implement': 'Connector_',
'implements': 'Connector_',
'stores': 'Connector_',
'storing': 'Connector_',
'forward': 'Connector_',
'forwards': 'Connector_',
'forwarding': 'Connector_',
'check': 'Connector_',
'checking': 'Connector_',
'checks': 'Connector_',
'changes': 'Connector_',
'change': 'Connector_',
'stream': 'Pattern_',
'streams': 'Connector_',
'streaming': 'Connector_',
'receive': 'Connector_',
'receives': 'Connector_',
'received': 'Connector_',
'receiving': 'Connector_',
'deliver': 'Connector_',
'delivers': 'Connector_',
'delivering': 'Connector_',
'calling': 'Connector_',
'called': 'Connector_',
'talk with': 'Connector_',
'expose': 'Connector_',
'exposes': 'Connector_',
'exposing': 'Connector_',
'accessing': 'Connector_',
'access': 'Connector_',
'accessed': 'Connector_',
'interacting': 'Connector_',
'interact': 'Connector_',
'interacts': 'Connector_',
'interaction': 'Connector_',
'fed from': 'Connector_',
'read': 'Connector_',
'reads': 'Connector_',
'reading': 'Connector_',
'dumps': 'Connector_',
'dumping': 'Connector_',
'grab': 'Connector_',
'grabs': 'Connector_',
'pushing': 'Connector_',
'push': 'Connector_',
'link': 'Connector_',
'links': 'Connector_',
'linking': 'Connector_',
'routing': 'Pattern_',
'route': 'Connector_',
'share': 'Connector_',
'shares': 'Connector_',
'shared': 'Connector_',
'sharing': 'Connector_',
'gets': 'Connector_',
'get': 'Connector_',
'getting': 'Connector_',
'pull': 'Connector_',
'pulls': 'Connector_',
'pulling': 'Connector_',
'collect': 'Connector_',
'collects': 'Connector_',
'collecting': 'Connector_',
'save': 'Connector_',
'saving': 'Connector_',
'saves': 'Connector_',
'accept': 'Connector_',
'accepting': 'Connector_',
'accespts': 'Connector_',
'queries': 'Connector_',
'transmit': 'Connector_',
'transmits': 'Connector_',
'transmitted': 'Connector_',
'transmitting': 'Connector_',
'Â\xadexposing': 'Connector_',
'exposed': 'Connector_',
'trigger': 'Connector_',
'triggers': 'Connector_',
'triggering': 'Connector_',
'triggered': 'Connector_',
'notify': 'Connector_',
'notifies': 'Connector_',
'notified': 'Connector_',
'notifing': 'Connector_',
'dependency': 'Connector_',
'rout': 'Connector_',
'sink': 'Connector_',
'response': 'Connector_',
'address': 'Connector_',
'pattern': 'Pattern_',
'tactic': 'Pattern_',
'blocking': 'Pattern_',
'synchronous': 'Pattern_',
'sync': 'Pattern_',
'half-sync': 'Pattern_',
'half sync': 'Pattern_',
'asynchronous': 'Pattern_',
'async': 'Pattern_',
'half-async': 'Pattern_',
'half async': 'Pattern_',
'non-blocking': 'Pattern_',
'broadcast': 'Pattern_',
'messaging': 'Pattern_',
'broker': 'Pattern_',
'layer': 'Pattern_',
'client/server': 'Pattern_',
'client-server': 'Pattern_',
'client server': 'Pattern_',
'callback': 'Pattern_',
'loosely coupled': 'Pattern_',
'multicast': 'Pattern_',
'MVC': 'Pattern_',
'modal view controller': 'Pattern_',
'modalviewcontroller': 'Pattern_',
'MVP': 'Pattern_',
'MVVM': 'Pattern_',
'controller': 'Pattern_',
'view': 'Pattern_',
'NIO': 'Technology_',
'queuing': 'Pattern_',
'fifo': 'Pattern_',
'publish subscribe': 'Pattern_',
'publish-subscribe': 'Pattern_',
'subscribe/publish': 'Pattern_',
'subscribe-publish': 'Pattern_',
'publish/subscribe': 'Pattern_',
'publish and subscribe': 'Pattern_',
'pub/sub': 'Pattern_',
'pub sub': 'Pattern_',
'pub-sub': 'Pattern_',
'event based': 'Pattern_',
'event-based': 'Pattern_',
'event/based': 'Pattern_',
'event driven': 'Pattern_',
'event-driven': 'Pattern_',
'event/driven': 'Pattern_',
'REST': 'Technology_',
'RESTful': 'Pattern_',
'translator': 'Pattern_',
'translate': 'Pattern_',
'router': 'Pattern_',
'channel': 'Pattern_',
'dispatcher': 'Pattern_',
'publisher': 'Pattern_',
'subscriber': 'Pattern_',
'rpc': 'Technology_',
'remote procedure': 'Pattern_',
'remote-procedure': 'Pattern_',
'remote/procedure': 'Pattern_',
'poll': 'Pattern_',
'polling': 'Pattern_',
'peer to peer': 'Pattern_',
'peer-to-peer': 'Pattern_',
'peer2peer': 'Pattern_',
'p2p': 'Pattern_',
'supernode': 'Pattern_',
'service-oriented': 'Pattern_',
'service oriented': 'Pattern_',
'service/oriented': 'Pattern_',
'SOA': 'Pattern_',
'Software as a service': 'Pattern_',
'SaaS': 'Pattern_',
'store-and-forward queue': 'Pattern_',
'shared repository': 'Pattern_',
'shared-repository': 'Pattern_',
'active repository': 'Pattern_',
'active-repository': 'Pattern_',
'repository': 'Pattern_',
'blackboard': 'Pattern_',
'proxy': 'Pattern_',
'resource pool': 'Pattern_',
'cache': 'Pattern_',
'lookup': 'Pattern_',
'evictor': 'Pattern_',
'handler': 'Pattern_',
'reactor': 'Pattern_',
'master-slave': 'Pattern_',
'master slave': 'Pattern_',
'master/slave': 'Pattern_',
'encapsulation': 'Pattern_',
'encapsulate': 'Pattern_',
'intermediary': 'Pattern_',
'cohesion': 'Pattern_',
'replication': 'Pattern_',
'interceptor': 'Pattern_',
'reflection': 'Pattern_',
'interpreter': 'Pattern_',
'microkernel': 'Pattern_',
'pipes and filters': 'Pattern_',
'pipe': 'Pattern_',
'filter': 'Pattern_',
'facade': 'Pattern_',
'monitor': 'Pattern_',
'monitoring': 'Pattern_',
'ping': 'Pattern_',
'heartbeat': 'Pattern_',
'timestamp': 'Pattern_',
'sanity': 'Pattern_',
'voting': 'Pattern_',
'redundancy': 'Pattern_',
'shadow': 'Pattern_',
'orchestration': 'Pattern_',
'orchestrate': 'Pattern_',
'coherence': 'Pattern_',
'concurrency': 'Pattern_',
'authenticate': 'Pattern_',
'authentication': 'Pattern_',
'authorise': 'Pattern_',
'authorize': 'Pattern_',
'authorization': 'Pattern_',
'encrypt': 'Pattern_',
'encryption': 'Pattern_',
'ipc': 'Pattern_',
'load balancing': 'Pattern_',
'load-balancing': 'Pattern_',
'load balance': 'Pattern_',
'load-balance': 'Pattern_',
'multi-tenant': 'Pattern_',
'multi tenant': 'Pattern_',
'multitenant': 'Pattern_',
'named-pipes': 'Pattern_',
'namedpipes': 'Pattern_',
'named pipes': 'Pattern_',
'data-ingest': 'Pattern_',
'dataingest': 'Pattern_',
'transactional': 'Pattern_',
'stream-storage': 'Pattern_',
'streamstorage': 'Pattern_',
'stream-processing': 'Pattern_',
'streamprocessing': 'Pattern_',
'structured': 'Pattern_',
'unstructured': 'Pattern_',
'hot': 'Pattern_',
'warm': 'Pattern_',
'cold': 'Pattern_',
'batch': 'Pattern_',
'batch-processing': 'Pattern_',
'temperature': 'Pattern_',
'interactive analytics': 'Pattern_',
'interactive-analytics': 'Pattern_',
'microservice': 'Pattern_',
'microservices': 'Pattern_',
'esp': 'Pattern_',
'bi': 'Pattern_',
'accessibility': 'Quality_Attribute_',
'accessible': 'Quality_Attribute_',
'accountability': 'Quality_Attribute_',
'accountable': 'Quality_Attribute_',
'accuracy': 'Quality_Attribute_',
'accurate': 'Quality_Attribute_',
'adaptability': 'Quality_Attribute_',
'adaptable': 'Quality_Attribute_',
'adapte': 'Quality_Attribute_',
'administrability': 'Quality_Attribute_',
'administrable': 'Quality_Attribute_',
'affordability': 'Quality_Attribute_',
'affordable': 'Quality_Attribute_',
'auditability': 'Quality_Attribute_',
'auditable': 'Quality_Attribute_',
'availability': 'Quality_Attribute_',
'available': 'Quality_Attribute_',
'compatibility': 'Quality_Attribute_',
'compatible': 'Quality_Attribute_',
'composability': 'Quality_Attribute_',
'composable': 'Quality_Attribute_',
'configurability': 'Quality_Attribute_',
'configurable': 'Quality_Attribute_',
'correctness': 'Quality_Attribute_',
'credibility': 'Quality_Attribute_',
'credible': 'Quality_Attribute_',
'customizability': 'Quality_Attribute_',
'customizable': 'Quality_Attribute_',
'debuggability': 'Quality_Attribute_',
'debuggable': 'Quality_Attribute_',
'debatable': 'Quality_Attribute_',
'degradability': 'Quality_Attribute_',
'degradable': 'Quality_Attribute_',
'determinability': 'Quality_Attribute_',
'determinable': 'Quality_Attribute_',
'demonstrability': 'Quality_Attribute_',
'demonstrable': 'Quality_Attribute_',
'dependability': 'Quality_Attribute_',
'dependable': 'Quality_Attribute_',
'depend': 'Quality_Attribute_',
'deployability': 'Quality_Attribute_',
'deployable': 'Quality_Attribute_',
'deploy': 'Quality_Attribute_',
'distributability': 'Quality_Attribute_',
'distributable': 'Quality_Attribute_',
'distribute': 'Quality_Attribute_',
'durability': 'Quality_Attribute_',
'durable': 'Quality_Attribute_',
'effectiveness': 'Quality_Attribute_',
'effective': 'Quality_Attribute_',
'efficiency': 'Quality_Attribute_',
'efficient': 'Quality_Attribute_',
'evolvability': 'Quality_Attribute_',
'evolve': 'Quality_Attribute_',
'evolvable': 'Quality_Attribute_',
'extensibility': 'Quality_Attribute_',
'extensible': 'Quality_Attribute_',
'fidelity': 'Quality_Attribute_',
'flexibility': 'Quality_Attribute_',
'flexible': 'Quality_Attribute_',
'inspectability': 'Quality_Attribute_',
'inspectable': 'Quality_Attribute_',
'installability': 'Quality_Attribute_',
'installable': 'Quality_Attribute_',
'integrity': 'Quality_Attribute_',
'integrate': 'Quality_Attribute_',
'interchangeability': 'Quality_Attribute_',
'interchangeable': 'Quality_Attribute_',
'interoperability': 'Quality_Attribute_',
'interoperable': 'Quality_Attribute_',
'latency': 'Quality_Attribute_',
'learnability': 'Quality_Attribute_',
'maintainability': 'Quality_Attribute_',
'maintainable': 'Quality_Attribute_',
'maintenance': 'Quality_Attribute_',
'manageability': 'Quality_Attribute_',
'manageable': 'Quality_Attribute_',
'mobility': 'Quality_Attribute_',
'modifiability': 'Quality_Attribute_',
'modifiable': 'Quality_Attribute_',
'modularity': 'Quality_Attribute_',
'operability': 'Quality_Attribute_',
'operable': 'Quality_Attribute_',
'orthogonality': 'Quality_Attribute_',
'portability': 'Quality_Attribute_',
'portable': 'Quality_Attribute_',
'precision': 'Quality_Attribute_',
'predictability': 'Quality_Attribute_',
'predictable': 'Quality_Attribute_',
'producibility': 'Quality_Attribute_',
'provability': 'Quality_Attribute_',
'recoverability': 'Quality_Attribute_',
'recoverable': 'Quality_Attribute_',
'reliability': 'Quality_Attribute_',
'reliable': 'Quality_Attribute_',
'reliably': 'Quality_Attribute_',
'repeatability': 'Quality_Attribute_',
'repeatable': 'Quality_Attribute_',
'reproducibility': 'Quality_Attribute_',
'resilience': 'Quality_Attribute_',
'responsiveness': 'Quality_Attribute_',
'reusability': 'Quality_Attribute_',
'reusable': 'Quality_Attribute_',
'reuse': 'Quality_Attribute_',
'robustness': 'Quality_Attribute_',
'robust': 'Quality_Attribute_',
'roundtrip': 'Quality_Attribute_',
'safety': 'Quality_Attribute_',
'safe': 'Quality_Attribute_',
'scalability': 'Quality_Attribute_',
'scale': 'Quality_Attribute_',
'scalable': 'Quality_Attribute_',
'scaling': 'Quality_Attribute_',
'seamlessness': 'Quality_Attribute_',
'sustainability': 'Quality_Attribute_',
'sustainable': 'Quality_Attribute_',
'serviceability': 'Quality_Attribute_',
'speed': 'Quality_Attribute_',
'supportability': 'Quality_Attribute_',
'securability': 'Quality_Attribute_',
'secure': 'Quality_Attribute_',
'security': 'Quality_Attribute_',
'simplicity': 'Quality_Attribute_',
'simple': 'Quality_Attribute_',
'stability': 'Quality_Attribute_',
'stable': 'Quality_Attribute_',
'survivability': 'Quality_Attribute_',
'tailorability': 'Quality_Attribute_',
'tailorable': 'Quality_Attribute_',
'tailor': 'Quality_Attribute_',
'testability': 'Quality_Attribute_',
'testable': 'Quality_Attribute_',
'throughput': 'Quality_Attribute_',
'traceability': 'Quality_Attribute_',
'traceable': 'Quality_Attribute_',
'transparency': 'Quality_Attribute_',
'transparent': 'Quality_Attribute_',
'ubiquity': 'Quality_Attribute_',
'understandability': 'Quality_Attribute_',
'understable': 'Quality_Attribute_',
'upgradability': 'Quality_Attribute_',
'upgradable': 'Quality_Attribute_',
'usability': 'Quality_Attribute_',
'usable': 'Quality_Attribute_',
'good': 'Quality_Attribute_',
'technology': 'Technology_',
'framework': 'Technology_',
'protocol': 'Technology_',
'standard': 'Technology_',
'provider': 'Technology_',
'library': 'Technology_',
'language': 'Technology_',
'message queues': 'Technology_',
'message queueing': 'Technology_',
'messaging system': 'Technology_',
'product': 'Technology_',
'toolkit': 'Technology_',
'kit': 'Technology_',
'sdk': 'Technology_',
'dbms': 'Technology_',
'rdbms': 'Technology_',
'format': 'Technology_',
'api': 'Technology_',
'stack': 'Technology_',
'.NET': 'Technology_',
'.NET Winform': 'Technology_',
'ActiveMQ': 'Technology_',
'activemq': 'Technology_',
'activex': 'Technology_',
'ADO.Net Data Service': 'Technology_',
'AJAX': 'Technology_',
'Amazon SQS': 'Technology_',
'AMQP': 'Technology_',
'amqp': 'Technology_',
'Apache Camel': 'Technology_',
'Apache MINA': 'Technology_',
'AppHarbor': 'Technology_',
'ASMX': 'Technology_',
'ASP.NET': 'Technology_',
'ASP.Net': 'Technology_',
'Asp.net': 'Technology_',
'asp.net': 'Technology_',
'ASP.NET MVC': 'Technology_',
'ASP.Net MVC': 'Technology_',
'Avro': 'Technology_',
'AWS': 'Technology_',
'Axis': 'Technology_',
'Backbone': 'Technology_',
'backbone': 'Technology_',
'BAPI': 'Technology_',
'BAPIs': 'Technology_',
'BERT': 'Technology_',
'BinaryFormatter': 'Technology_',
'Biztalk': 'Technology_',
'biztalk': 'Technology_',
'BSON': 'Technology_',
'ByteBuffers': 'Technology_',
'C': 'Technology_',
'C#': 'Technology_',
'celery': 'Technology_',
'Clojure': 'Technology_',
'CloudAMQP': 'Technology_',
'cloudControl': 'Technology_',
'com': 'Technology_',
'COM': 'Technology_',
'Compact Framework .NET': 'Technology_',
'Component Object Model (COM)': 'Technology_',
'CORBA': 'Technology_',
'CORBA ORB': 'Technology_',
'Crystal Reports': 'Technology_',
'cxf': 'Technology_',
'D-Bus': 'Technology_',
'Delphi': 'Technology_',
'Django': 'Technology_',
'Django/Python': 'Technology_',
'Duet': 'Technology_',
'EC2': 'Technology_',
'ECC6.0': 'Technology_',
'EJB': 'Technology_',
'EMS': 'Technology_',
'ems': 'Technology_',
'Enterprise Services Explorer': 'Technology_',
'Entity Framework': 'Technology_',
'ERPConnect': 'Technology_',
'ErpConnect': 'Technology_',
'Finagle': 'Technology_',
'finagle': 'Technology_',
'Finatra': 'Technology_',
'GNOME Bonobo': 'Technology_',
'grails': 'Technology_',
'groovy': 'Technology_',
'Groovy': 'Technology_',
'Groovy SOAP library': 'Technology_',
'Groovy WSLite': 'Technology_',
'groovy wslite': 'Technology_',
'GroovyWS': 'Technology_',
'groovyws': 'Technology_',
'hadoop': 'Technology_',
'Hazelcast': 'Technology_',
'Heroku': 'Technology_',
'http': 'Technology_',
'HTTP': 'Technology_',
'HTTP Builder': 'Technology_',
'HTTPS': 'Technology_',
'IronMQ': 'Technology_',
'IronWorker': 'Technology_',
'JIntegra': 'Technology_',
'java': 'Technology_',
'Java': 'Technology_',
'Java EE': 'Technology_',
'Java JMS': 'Technology_',
'JAX': 'Technology_',
'JAX-RPC': 'Technology_',
'JAX-WS': 'Technology_',
'JAX-RS': 'Technology_',
'jax-ws': 'Technology_',
'JBoss': 'Technology_',
'jdbc': 'Technology_',
'Jigsaw': 'Technology_',
'jms': 'Technology_',
'JMS': 'Technology_',
'jMSMQ': 'Technology_',
'JNI': 'Technology_',
'jni4net': 'Technology_',
'JS': 'Technology_',
'JSON': 'Technology_',
'JSON messaging': 'Technology_',
'KDE': 'Technology_',
'LINQ': 'Technology_',
'MassTransit': 'Technology_',
'MessagePack': 'Technology_',
'Microsoft sync frame work': 'Technology_',
'Microsoft WCF': 'Technology_',
'MINA': 'Technology_',
'MS-SQL 2005+': 'Technology_',
'MSMQ': 'Technology_',
'msmq': 'Technology_',
'MsmqJava': 'Technology_',
'MT': 'Technology_',
'MTOM': 'Technology_',
'MVC controllers': 'Technology_',
'native dll': 'Technology_',
'Netapi32.dll': 'Technology_',
'netty': 'Technology_',
'Netty': 'Technology_',
'Netty 4.x': 'Technology_',
'NServiceBus': 'Technology_',
'nservicebus': 'Technology_',
'nusoap': 'Technology_',
'ODBC': 'Technology_',
'OLEDB': 'Technology_',
'ORBit': 'Technology_',
'OSGi': 'Technology_',
'osgi': 'Technology_',
'PI': 'Technology_',
'PInvoke': 'Technology_',
'Play': 'Technology_',
'POX': 'Technology_',
'protobuf': 'Technology_',
'protobuf-net': 'Technology_',
'ProtoBuffers': 'Technology_',
'protobufs': 'Technology_',
'Protocol Buffer': 'Technology_',
'protocol buffers': 'Technology_',
'Protocol Buffers': 'Technology_',
'Protocol buffers': 'Technology_',
'ProtoRPC': 'Technology_',
'Protorpc': 'Technology_',
'RabbitMQ': 'Technology_',
'Rabbitmq': 'Technology_',
'rabbitmq': 'Technology_',
'RackSpace': 'Technology_',
'Rails': 'Technology_',
'Remoting': 'Technology_',
'remoting': 'Technology_',
'Rendezvous': 'Technology_',
'rest.li': 'Technology_',
'RESTEasy': 'Technology_',
'RESTExpress': 'Technology_',
'RestExpress': 'Technology_',
'RESTfull': 'Technology_',
'RESTFull': 'Technology_',
'RFC': 'Technology_',
'RFCs': 'Technology_',
'RIA': 'Technology_',
'RIA services': 'Technology_',
'RIA Services': 'Technology_',
'RMI': 'Technology_',
'Ruby': 'Technology_',
'RV': 'Technology_',
'rv': 'Technology_',
'rvd': 'Technology_',
'RX': 'Technology_',
'rx': 'Technology_',
'SAP': 'Technology_',
'SAP .NET connector': 'Technology_',
'SAP 4.6C': 'Technology_',
'SAP Connector': 'Technology_',
'SAP Connector for .NET': 'Technology_',
'Scala': 'Technology_',
'Scalatra': 'Technology_',
'ServerSocket': 'Technology_',
'Service Broker': 'Technology_',
'service broker': 'Technology_',
'SignalR': 'Technology_',
'Silverlight': 'Technology_',
'silverlight': 'Technology_',
'Silverlight enabled WCF Service': 'Technology_',
'Silverlight- enabled web service': 'Technology_',
'Silverlight-enabled WCF service': 'Technology_',
'silverlight-enabled wcf service': 'Technology_',
'Silverlight-enabled WCF services': 'Technology_',
'Silverlight-enabled web service': 'Technology_',
'Sitrion': 'Technology_',
'soap': 'Technology_',
'SOAP': 'Technology_',
'SOAP/HTTP': 'Technology_',
'SOAPlib': 'Technology_',
'sockets': 'Technology_',
'Spread': 'Technology_',
'spring': 'Technology_',
'Spring Remoting': 'Technology_',
'spring web services': 'Technology_',
'Spring WS': 'Technology_',
'springws': 'Technology_',
'springws grails plugin': 'Technology_',
'SQL': 'Technology_',
'SQL Database table': 'Technology_',
'SQL Server': 'Technology_',
'SQL server Service Broker': 'Technology_',
'SQS': 'Technology_',
'Storm': 'Technology_',
'StormMQ': 'Technology_',
'Struts': 'Technology_',
'Sun RPC': 'Technology_',
'tcp': 'Technology_',
'terracotta': 'Technology_',
'Terracotta': 'Technology_',
'Thrift': 'Technology_',
'Tibco': 'Technology_',
'tibco': 'Technology_',
'TIBCO': 'Technology_',
'Tibco EMS': 'Technology_',
'Tomcat': 'Technology_',
'transaction SOAMANAGER': 'Technology_',
'UDP': 'Technology_',
'udp': 'Technology_',
'Vert.x': 'Technology_',
'WCF': 'Technology_',
'Web service': 'Technology_',
'web service': 'Technology_',
'WebService': 'Technology_',
'WebServices': 'Technology_',
'WebSphere MQ': 'Technology_',
'Windows Management Instrumentation': 'Technology_',
'Windows Service': 'Technology_',
'WMI': 'Technology_',
'WPF': 'Technology_',
'WS': 'Technology_',
'WS-I Basic Profile': 'Technology_',
'ws security': 'Technology_',
'WSDL': 'Technology_',
'wsdl': 'Technology_',
'WSMQ': 'Technology_',
'XI': 'Technology_',
'Zend': 'Technology_',
'Zend Framework': 'Technology_',
'Zend SOAP': 'Technology_',
'ZeroMQ': 'Technology_',
'zeromq': 'Technology_',
'ZMQ': 'Technology_',
'ZSI': 'Technology_',
'jet': 'Technology_',
'mqtt': 'Technology_',
'redis': 'Technology_',
'wamp': 'Technology_',
'saml': 'Technology_',
'swing': 'Technology_',
'qpid': 'Technology_',
'xmpp': 'Technology_',
'.net': 'Technology_',
'.net-2.0': 'Technology_',
'.net 2.0': 'Technology_',
'.net 2': 'Technology_',
'.net-2': 'Technology_',
'.net-3.5': 'Technology_',
'.net 3.5': 'Technology_',
'.net-4.0': 'Technology_',
'.net 4.0': 'Technology_',
'.net-4': 'Technology_',
'.net 4': 'Technology_',
'.net-4.5': 'Technology_',
'.net 4.5': 'Technology_',
'access-vba': 'Technology_',
'access vba': 'Technology_',
'vba': 'Technology_',
'achartengine': 'Technology_',
'actionscript': 'Technology_',
'actionscript-2': 'Technology_',
'actionscript 2': 'Technology_',
'actionscript-3': 'Technology_',
'actionscript 3': 'Technology_',
'active-directory': 'Technology_',
'active directory': 'Technology_',
'ada': 'Technology_',
'ado': 'Technology_',
'ado.net': 'Technology_',
'aem': 'Technology_',
'afnetworking': 'Technology_',
'afnetworking-2': 'Technology_',
'afnetworking 2': 'Technology_',
'aggregation-framework': 'Technology_',
'air': 'Technology_',
'ajax': 'Technology_',
'akka': 'Technology_',
'alamofire': 'Technology_',
'alfresco': 'Technology_',
'amazon-cloudfront': 'Technology_',
'amazon cloudfront': 'Technology_',
'cloudfront': 'Technology_',
'amazon-dynamodb': 'Technology_',
'amazon dynamodb': 'Technology_',
'dynamodb': 'Technology_',
'amazon-ec2': 'Technology_',
'amazon ec2': 'Technology_',
'ec2': 'Technology_',
'amazon-redshift': 'Technology_',
'amazon redshift': 'Technology_',
'redshift': 'Technology_',
'amazon-s3': 'Technology_',
'amazon s3': 'Technology_',
's3': 'Technology_',
'amazon-web-services': 'Technology_',
'amazon web service': 'Technology_',
'anaconda': 'Technology_',
'andengine': 'Technology_',
'android-sqlite': 'Technology_',
'android-support-library': 'Technology_',
'android-xml': 'Technology_',
'angular2': 'Technology_',
'angularfire': 'Technology_',
'angularjs': 'Technology_',
'ansible': 'Technology_',
'ansible-playbook': 'Technology_',
'ansible playbook': 'Technology_',
'playbook': 'Technology_',
'antlr': 'Technology_',
'antlr4': 'Technology_',
'apache': 'Technology_',
'apache-camel': 'Technology_',
'apache camel': 'Technology_',
'camel': 'Technology_',
'apache-commons': 'Technology_',
'apache commons': 'Technology_',
'commons': 'Technology_',
'apache-httpclient-4.x': 'Technology_',
'apache httpclient': 'Technology_',
'apache-kafka': 'Technology_',
'apache kafka': 'Technology_',
'kafka': 'Technology_',
'apache-pig': 'Technology_',
'apache pig': 'Technology_',
'pig': 'Technology_',
'apache-poi': 'Technology_',
'apache poi': 'Technology_',
'poi': 'Technology_',
'apache-spark': 'Technology_',
'apache spark': 'Technology_',
'spark': 'Technology_',
'apache-spark-sql': 'Technology_',
'apache-storm': 'Technology_',
'apache2': 'Technology_',
'apex-code': 'Technology_',
'apex': 'Technology_',
'applescript': 'Technology_',
'applet': 'Technology_',
'arcgis': 'Technology_',
'arduino': 'Technology_',
'asihttprequest': 'Technology_',
'asp': 'Technology_',
'asp-classic': 'Technology_',
'asp.net-3.5': 'Technology_',
'asp.net-4.0': 'Technology_',
'asp.net-ajax': 'Technology_',
'asp.net-core': 'Technology_',
'asp.net-mvc': 'Technology_',
'asp.net-mvc-2': 'Technology_',
'asp.net-mvc-3': 'Technology_',
'asp.net-mvc-4': 'Technology_',
'asp.net-mvc-5': 'Technology_',
'asp.net-mvc-routing': 'Technology_',
'asp.net-web-api': 'Technology_',
'asp.net-web-api2': 'Technology_',
'aspectj': 'Technology_',
'autofac': 'Technology_',
'autohotkey': 'Technology_',
'autoit': 'Technology_',
'autolayout': 'Technology_',
'automapper': 'Technology_',
'avfoundation': 'Technology_',
'awk': 'Technology_',
'aws-sdk': 'Technology_',
'aws': 'Technology_',
'awt': 'Technology_',
'axapta': 'Technology_',
'axis': 'Technology_',
'axis2': 'Technology_',
'azure': 'Technology_',
'babeljs': 'Technology_',
'backbone.js': 'Technology_',
'beautifulsoup': 'Technology_',
'birt': 'Technology_',
'bison': 'Technology_',
'blade': 'Technology_',
'blender': 'Technology_',
'blogger': 'Technology_',
'boost': 'Technology_',
'boost-asio': 'Technology_',
'boost asio': 'Technology_',
'boto': 'Technology_',
'bouncycastle': 'Technology_',
'box2d': 'Technology_',
'breeze': 'Technology_',
'broadcastreceiver': 'Technology_',
'browserify': 'Technology_',
'bundler': 'Technology_',
'c': 'Technology_',
'c#': 'Technology_',
'c#-2.0': 'Technology_',
'c#-3.0': 'Technology_',
'c#-4.0': 'Technology_',
'c++': 'Technology_',
...}
%% Cell type:code id: tags:
``` python
# Replace each term column of the document-term matrix by its ontology class
# and sum all columns that end up sharing a class label.
# The identifier column is kept out of the aggregation so the counts stay
# numeric instead of being mixed with strings.
issue_keys = dtm[["Issue key"]]
term_counts = dtm.drop(columns=["Issue key"]).rename(columns=ontology_dict)
# DataFrame.groupby(axis=1) is deprecated; group the transposed frame by its
# (now possibly duplicated) labels and transpose back.
class_counts = term_counts.T.groupby(level=0).sum().T
dtm = pd.concat([issue_keys, class_counts], axis=1)
# The column-wise groupby returned all labels in sorted order, with
# 'Issue key' sorted among them; keep that layout for downstream consumers.
dtm = dtm[sorted(dtm.columns)]
```
%% Output
C:\Users\Jayesh\AppData\Local\Temp\ipykernel_14208\44085670.py:5: FutureWarning: DataFrame.groupby with axis=1 is deprecated. Do `frame.T.groupby(...)` without axis instead.
C:\Users\Jayesh\AppData\Local\Temp\ipykernel_13724\44085670.py:5: FutureWarning: DataFrame.groupby with axis=1 is deprecated. Do `frame.T.groupby(...)` without axis instead.
dtm = dtm.groupby(by=dtm.columns, axis=1).sum()
%% Cell type:code id: tags:
``` python
# Persist the class-aggregated document-term matrix as CSV, without the row index.
dtm.to_csv(
    path_or_buf=r"E:\\DSSE\\DSSE-Group-7\\Assignment_2\\Week 2\\task2-iter2\\yarn_document_term_matrix_with_class.csv",
    index=False,
)
```
%% Cell type:code id: tags:
``` python
# drop the Issue key column
# dtm.drop(columns=["Issue key"], inplace=True)
```
%% Cell type:code id: tags:
``` python
# Reorder the columns so that 'Issue key' comes first and all other columns
# keep their relative order.
cols = dtm.columns.tolist()
del cols[cols.index('Issue key')]
dtm = dtm.loc[:, ['Issue key', *cols]]
```
%% Cell type:code id: tags:
``` python
# Fit an LDA topic model (online learning) on the class-aggregated counts.
# Only the parameter values that were actually in effect are kept: the earlier
# assignments TOPIC_COUNT = 8 and MAX_ITER = 100 were overwritten before use.
TOPIC_COUNT = 6  # n_components: number of topics
ALPHA = 0.01     # doc_topic_prior
BETA = 0.01      # topic_word_prior
MAX_ITER = 70    # max_iter
TAU_0 = 1.0      # learning_offset
# NOTE(review): the scikit-learn documentation states that learning_offset
# should be greater than 1.0 - confirm that 1.0 is intended.

lda = LatentDirichletAllocation(
    n_components=TOPIC_COUNT,
    doc_topic_prior=ALPHA,
    topic_word_prior=BETA,
    max_iter=MAX_ITER,
    learning_method='online',
    learning_offset=TAU_0,
    random_state=0,  # fixed seed for reproducible topics
)
# Every column after the first is used as a feature; the first column is
# skipped (presumably 'Issue key', which an earlier cell moves to the front).
# The document-topic matrix is left as the cell's displayed result.
lda.fit_transform(dtm.iloc[:, 1:])
```
%% Output
array([[1.69262018e-04, 3.99169387e-01, 5.99815041e-01, ...,
1.69262018e-04, 1.69262018e-04, 1.69262018e-04],
[3.47626089e-01, 8.32778148e-05, 6.51874244e-01, ...,
8.32778148e-05, 8.32778148e-05, 8.32778148e-05],
[1.88394876e-04, 2.64775738e-01, 1.88394876e-04, ...,
1.88394876e-04, 4.91167740e-01, 1.88394876e-04],
array([[1.63773338e-04, 9.99181133e-01, 1.63773338e-04, 1.63773338e-04,
1.63773338e-04, 1.63773338e-04],
[9.13827901e-01, 8.19269212e-05, 5.08932229e-02, 8.19269212e-05,
3.50330955e-02, 8.19269212e-05],
[1.69319336e-04, 1.69319336e-04, 9.99153403e-01, 1.69319336e-04,
1.69319336e-04, 1.69319336e-04],
...,
[3.51584918e-01, 4.33275563e-04, 4.33275563e-04, ...,
4.33275563e-04, 4.33275563e-04, 2.52431056e-01],
[4.19582863e-01, 3.34360906e-01, 1.33191263e-04, ...,
1.33191263e-04, 1.33191263e-04, 1.33191263e-04],
[2.49500998e-04, 1.26157962e-01, 3.10612983e-01, ...,
9.84989021e-02, 1.99489665e-01, 4.01558511e-02]])
[1.45981081e-01, 3.44115623e-04, 5.10725015e-01, 3.44115623e-04,
3.42261557e-01, 3.44115623e-04],
[3.50748806e-01, 2.36819884e-01, 3.64742071e-01, 4.74227853e-02,
1.33226752e-04, 1.33226752e-04],
[1.23894077e-01, 1.78979560e-01, 5.53463933e-01, 2.37755587e-04,
1.43186918e-01, 2.37755587e-04]])
%% Cell type:code id: tags:
``` python
# print topics of result
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted features of every topic in ``model``.

    For each row of ``model.components_`` a header line ``Topic #<idx>: `` is
    printed, followed by one line of space-separated ``name (share)`` entries,
    where ``share`` is the feature's weight divided by the sum of the row's
    weights, formatted with two decimals. A blank line follows each topic and
    one more blank line is printed after the last topic.

    Args:
        model: Object exposing ``components_``, an iterable of array-like rows
            that support ``.sum()``, ``.argsort()`` and integer indexing.
        feature_names: Sequence indexable by feature position.
        n_top_words: Number of features to print per topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        topic_sum = topic.sum()
        # Rank once per topic: positions of the n_top_words largest weights,
        # largest first (ascending argsort walked backwards from the end).
        ranked = topic.argsort()[:-n_top_words - 1:-1]
        terms = " ".join(
            f"{feature_names[i]} ({topic[i] / topic_sum:.2f})" for i in ranked
        )
        print(f"Topic #{topic_idx}: ")
        print(terms)
        print()
    print()
# Show the 20 highest-weighted terms of each topic. The term names are taken
# from the same columns the model was fit on (every column after the first).
print_top_words(lda, dtm.columns[1:], 20)
```
%% Output
Topic #0:
Component_ (0.16) Technology_ (0.06) Connector_Data_ (0.05) Connector_ (0.05) Pattern_ (0.02) ProgrammingConcept_ (0.02) SoftwareArtifact_ (0.02) DigitalResource_ (0.02) Quality_Attribute_ (0.02) run (0.01) Requirement_ (0.01) state (0.01) start (0.01) time (0.01) need (0.01) use (0.01) restart (0.01) web (0.01) rest (0.01) provide (0.01)
Component_ (0.10) Pattern_ (0.06) Connector_ (0.06) Technology_ (0.05) SoftwareArtifact_ (0.04) Connector_Data_ (0.04) Requirement_ (0.03) ProgrammingConcept_ (0.02) DigitalResource_ (0.02) Quality_Attribute_ (0.02) info (0.01) use (0.01) timeline (0.01) need (0.01) time (0.01) rest (0.01) web (0.01) ui (0.01) provide (0.01) make (0.01)
Topic #1:
Pattern_ (0.19) Connector_ (0.04) Component_ (0.03) DigitalResource_ (0.02) Requirement_ (0.02) info (0.02) SoftwareArtifact_ (0.02) configuration (0.02) scheduler (0.02) Technology_ (0.02) capacity (0.01) support (0.01) create (0.01) set (0.01) parent (0.01) Quality_Attribute_ (0.01) use (0.01) limit (0.01) federation (0.01) rest (0.01)
Pattern_ (0.08) Component_ (0.05) Requirement_ (0.02) DigitalResource_ (0.02) Dominant Topic (0.02) support (0.02) configuration (0.02) capacity (0.02) set (0.02) scheduler (0.01) use (0.01) Connector_ (0.01) cpu (0.01) different (0.01) property (0.01) type (0.01) parent (0.01) memory (0.01) Quality_Attribute_ (0.01) need (0.01)
Topic #2:
Technology_ (0.10) SoftwareArtifact_ (0.04) Requirement_ (0.03) DigitalResource_ (0.03) Component_ (0.02) Connector_ (0.02) container (0.02) use (0.02) support (0.02) ProgrammingConcept_ (0.01) Quality_Attribute_ (0.01) type (0.01) need (0.01) work (0.01) provide (0.01) implementation (0.01) Connector_Data_ (0.01) local (0.01) launch (0.01) disk (0.01)
Dominant Topic (0.12) Technology_ (0.08) Component_ (0.06) container (0.03) DigitalResource_ (0.02) cluster (0.02) Connector_ (0.02) Connector_Data_ (0.02) support (0.02) Pattern_ (0.01) Requirement_ (0.01) schedule (0.01) Quality_Attribute_ (0.01) use (0.01) jira (0.01) preemption (0.01) allocate (0.01) track (0.01) need (0.01) allow (0.01)
Topic #3:
Component_ (0.10) Technology_ (0.04) Connector_Data_ (0.03) Connector_ (0.03) Requirement_ (0.03) label (0.03) cluster (0.02) use (0.02) support (0.02) SoftwareArtifact_ (0.02) DigitalResource_ (0.02) placement (0.02) need (0.01) level (0.01) timeline (0.01) aggregation (0.01) flow (0.01) aggregate (0.01) collector (0.01) constraint (0.01)
Component_ (0.16) Technology_ (0.08) container (0.05) Connector_Data_ (0.03) run (0.02) Connector_ (0.02) DigitalResource_ (0.02) state (0.01) SoftwareArtifact_ (0.01) restart (0.01) start (0.01) launch (0.01) label (0.01) Requirement_ (0.01) need (0.01) Quality_Attribute_ (0.01) use (0.01) manager (0.01) kill (0.01) support (0.01)
Topic #4:
Technology_ (0.08) Component_ (0.07) memory (0.04) DigitalResource_ (0.03) cpu (0.03) support (0.02) Connector_Data_ (0.02) gpu (0.02) upgrade (0.02) core (0.02) attribute (0.01) use (0.01) nan (0.01) usage (0.01) schedule (0.01) gb (0.01) delay (0.01) Connector_ (0.01) make (0.01) Quality_Attribute_ (0.01)
Technology_ (0.07) SoftwareArtifact_ (0.04) Connector_ (0.03) local (0.03) storage (0.02) disk (0.02) directory (0.02) image (0.02) use (0.02) ProgrammingConcept_ (0.02) type (0.02) path (0.02) create (0.02) flow (0.02) decommission (0.02) implementation (0.02) Dominant Topic (0.01) delete (0.01) need (0.01) run (0.01)
Topic #5:
container (0.26) Component_ (0.09) Connector_Data_ (0.05) preemption (0.04) run (0.03) allocate (0.03) kill (0.03) allocation (0.02) preempt (0.02) opportunistic (0.02) Pattern_ (0.01) reserve (0.01) allow (0.01) long (0.01) need (0.01) currently (0.01) increase (0.01) Connector_ (0.01) reservation (0.01) guaranteed (0.01)
Topic #6:
Component_ (0.14) Technology_ (0.06) scheduler (0.05) schedule (0.04) cluster (0.03) fair (0.02) Quality_Attribute_ (0.02) priority (0.02) Pattern_ (0.02) Connector_ (0.02) capacity (0.02) use (0.01) DigitalResource_ (0.01) currently (0.01) run (0.01) support (0.01) large (0.01) Requirement_ (0.01) base (0.01) time (0.01)
Topic #7:
Technology_ (0.10) Connector_ (0.05) jira (0.04) DigitalResource_ (0.04) track (0.03) support (0.03) reservation (0.02) decommission (0.02) umbrella (0.01) common (0.01) error (0.01) proposes (0.01) child (0.01) metric (0.01) yarnclient (0.01) profile (0.01) reservationsystem (0.01) transition (0.01) dns (0.01) help (0.01)
Dominant Topic (0.18) scheduler (0.07) Connector_ (0.03) fair (0.03) priority (0.03) reservation (0.03) support (0.02) capacity (0.02) SoftwareArtifact_ (0.02) Pattern_ (0.02) jira (0.02) reserve (0.01) roll (0.01) DigitalResource_ (0.01) schedule (0.01) use (0.01) design (0.01) currently (0.01) activity (0.01) reservationsystem (0.01)
%% Cell type:code id: tags:
``` python
# Perplexity of the fitted model, evaluated on the same columns it was fit on
# (every column after the first).
print(f"Perplexity: {lda.perplexity(dtm.iloc[:, 1:])}")
```
%% Output
Perplexity: 574.937022778021
%% Cell type:code id: tags:
``` python
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
```
%% Cell type:code id: tags:
``` python
import pickle

# load vectorizer
# The path is a raw string: as a normal literal its backslashes form invalid
# escape sequences ('\D', '\A', '\W', '\y'), which raises a DeprecationWarning.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a pickle that was produced by this project.
with open(r"E:\DSSE\DSSE-Group-7\Assignment_2\Week 2\yarn_vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)

# NOTE(review): lda was fit on dtm.iloc[:, 1:], so the columns of
# lda.components_ follow dtm.columns[1:] (the names print_top_words is given).
# Confirm that the pickled vectorizer's vocabulary has the same order and
# length, otherwise the words below do not belong to the ranked indices.
feature_names = vectorizer.get_feature_names_out()
topic_term_matrix = lda.components_

# The 20 highest-weighted terms of every topic, largest weight first.
topics = [[feature_names[i] for i in topic.argsort()[:-20 - 1:-1]] for topic in topic_term_matrix]

# Dictionary over all column names of the document-term matrix.
dictionary = Dictionary([dtm.columns.tolist()])

# NOTE(review): `texts` is a single pseudo-document listing every column name
# once, not the tokenised issues -- confirm this is the intended reference
# corpus for the c_v measure.
coherence_model_lda = CoherenceModel(
    topics=topics,
    texts=[dtm.columns[1:].tolist()],
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print(f"Coherence Score: {coherence_lda}")
```
%% Output
<>:4: DeprecationWarning: invalid escape sequence '\D'
<>:4: DeprecationWarning: invalid escape sequence '\D'
C:\Users\Jayesh\AppData\Local\Temp\ipykernel_14208\1335506831.py:4: DeprecationWarning: invalid escape sequence '\D'
with open("E:\DSSE\DSSE-Group-7\Assignment_2\Week 2\yarn_vectorizer.pkl", "rb") as f:
Coherence Score: 0.5968485427897414
%% Cell type:code id: tags:
``` python
import matplotlib.pyplot as plt
import numpy as np

# Document-topic distribution of every issue under the fitted model
# (column 0 of dtm is skipped, exactly as when the model was fit).
document_topic_distribution = lda.transform(dtm.iloc[:, 1:])

# Dominant topic of an issue = index of its largest topic proportion.
dominant_topics = np.argmax(document_topic_distribution, axis=1)

# Proportion held by that dominant topic, one value per issue.
topic_proportions_per_issue = np.max(document_topic_distribution, axis=1)

# Visualize how strongly issues are dominated by a single topic
# (ten equal-width bins over [0, 1]).
plt.hist(topic_proportions_per_issue, bins=np.linspace(0, 1, 11))
plt.xlabel('Topic Proportion')
plt.ylabel('Number of Issues')
plt.title('Distribution of Topic Proportions')
plt.show()

# Attach the labels to a copy instead of writing into `dtm` itself. The cells
# that fit LDA read every column after the first one, so a 'Dominant Topic'
# column stored in `dtm` would be fed back into the model as if it were a
# term count whenever those cells are run again. assign() returns a new,
# consolidated frame, which also avoids the "highly fragmented"
# PerformanceWarning of inserting into the wide frame.
dtm_with_topics = dtm.assign(**{'Dominant Topic': dominant_topics})

# Save the document-term matrix with dominant topics
dtm_with_topics.to_csv(r"E:\\DSSE\\DSSE-Group-7\\Assignment_2\\Week 2\\task2-iter2\\yarn_document_term_matrix_with_dominant_topics.csv", index=False)
```
%% Output
C:\Users\Jayesh\AppData\Local\Temp\ipykernel_14208\986355751.py:23: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\Jayesh\AppData\Local\Temp\ipykernel_13724\986355751.py:23: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
dtm['Dominant Topic'] = dominant_topics
%% Cell type:code id: tags:
``` python
import pickle

# load vectorizer
# The path is a raw string: as a normal literal its backslashes form invalid
# escape sequences ('\D', '\A', '\W', '\y'), which raises a DeprecationWarning.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a pickle that was produced by this project.
with open(r"E:\DSSE\DSSE-Group-7\Assignment_2\Week 2\yarn_vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)
feature_names = vectorizer.get_feature_names_out()
def do_lda(n_topics=8, iterations=10):
    """Fit an online LDA model and report its perplexity and c_v coherence.

    Reads the module-level ``dtm`` (document-term matrix whose first column
    is skipped) and ``feature_names`` (index -> term lookup).

    Args:
        n_topics: Number of topics to fit.
        iterations: Maximum number of passes of the online learner.

    Returns:
        Tuple ``(perplexity, coherence)``: the model's perplexity on the
        data it was fit on and the c_v coherence of each topic's 20
        highest-weighted terms.
    """
    TOPIC_COUNT = n_topics
    ALPHA = 0.01
    BETA = 0.01
    MAX_ITER = iterations
    TAU_0 = 1.0

    # Build the model input once. 'Dominant Topic' is a label another cell may
    # add to dtm, not a term count, so it is excluded when present.
    features = dtm.iloc[:, 1:].drop(columns=['Dominant Topic'], errors='ignore')

    # Fit LDA model. Only the fitted components are needed here, so fit() is
    # used and no document-topic matrix is computed and thrown away.
    lda = LatentDirichletAllocation(
        n_components=TOPIC_COUNT,
        doc_topic_prior=ALPHA,
        topic_word_prior=BETA,
        max_iter=MAX_ITER,
        learning_method='online',
        learning_offset=TAU_0,
        random_state=0,
    )
    lda.fit(features)

    # NOTE(review): components_ columns follow the columns of `features`;
    # confirm that `feature_names` uses the same order and length.
    topics = [
        [feature_names[i] for i in topic.argsort()[:-20 - 1:-1]]
        for topic in lda.components_
    ]
    dictionary = Dictionary([dtm.columns.tolist()])
    # NOTE(review): `texts` is one pseudo-document of all column names, not the
    # tokenised issues -- confirm this is the intended reference corpus.
    coherence_model_lda = CoherenceModel(
        topics=topics,
        texts=[dtm.columns[1:].tolist()],
        dictionary=dictionary,
        coherence='c_v',
    )
    coherence_lda = coherence_model_lda.get_coherence()
    return lda.perplexity(features), coherence_lda
# Sweep the number of topics from 3 to 12 (100 iterations per model) and
# tabulate the perplexity and coherence that do_lda reports for each setting.
topic_sweep = [(n_topics, *do_lda(n_topics, 100)) for n_topics in range(3, 13)]
results = pd.DataFrame(topic_sweep, columns=['Topics', 'Perplexity', 'Coherence'])
```
%% Cell type:code id: tags:
``` python
# Display the first ten rows of `results`.
results.head(10)
```
%% Output
Topics Perplexity Coherence
0 3 520.520313 0.578289
1 4 510.312184 0.578519
2 5 510.284144 0.588379
3 6 506.440757 0.616076
4 7 510.771170 0.609888
5 8 507.938855 0.623681
6 9 507.536231 0.618575
7 10 507.532783 0.635756
8 11 504.534852 0.641812
9 12 507.308382 0.639409
%% Cell type:code id: tags:
``` python
# Plot perplexity (left axis, red) and coherence (right axis, blue) against
# the number of topics.
import matplotlib.pyplot as plt


def _draw_metric(axis, x_column, y_column, line_color):
    """Draw results[y_column] over results[x_column] on `axis` in one colour."""
    axis.set_ylabel(y_column, color=line_color)
    axis.plot(results[x_column], results[y_column], color=line_color)
    axis.tick_params(axis='y', labelcolor=line_color)


fig, ax1 = plt.subplots()
ax1.set_xlabel('Topics')
_draw_metric(ax1, 'Topics', 'Perplexity', 'tab:red')

# Second y-axis sharing the same x-axis, so both metrics fit in one figure.
ax2 = ax1.twinx()
_draw_metric(ax2, 'Topics', 'Coherence', 'tab:blue')

fig.tight_layout()
plt.show()
```
%% Output
%% Cell type:code id: tags:
``` python
# Sweep the iteration budget from 10 to 100 in steps of 10 with the topic
# count held at 6, and tabulate the perplexity and coherence of each model.
iteration_sweep = [(n_iter, *do_lda(6, n_iter)) for n_iter in range(10, 101, 10)]
results = pd.DataFrame(iteration_sweep, columns=['Iterations', 'Perplexity', 'Coherence'])
results.head(10)
```
%% Output
Iterations Perplexity Coherence
0 10 514.366710 0.602533
1 20 510.587385 0.613113
2 30 508.842389 0.609688
3 40 508.159335 0.614432
4 50 507.606386 0.614432
5 60 507.133144 0.614432
6 70 506.846351 0.616076
7 80 506.698547 0.616076
8 90 506.607472 0.616076
9 100 506.440757 0.616076
%% Cell type:code id: tags:
``` python
# Plot perplexity (left axis, red) and coherence (right axis, blue) against
# the number of iterations.
import matplotlib.pyplot as plt


def _draw_metric(axis, x_column, y_column, line_color):
    """Draw results[y_column] over results[x_column] on `axis` in one colour."""
    axis.set_ylabel(y_column, color=line_color)
    axis.plot(results[x_column], results[y_column], color=line_color)
    axis.tick_params(axis='y', labelcolor=line_color)


fig, ax1 = plt.subplots()
ax1.set_xlabel('Iterations')
_draw_metric(ax1, 'Iterations', 'Perplexity', 'tab:red')

# Second y-axis sharing the same x-axis, so both metrics fit in one figure.
ax2 = ax1.twinx()
_draw_metric(ax2, 'Iterations', 'Coherence', 'tab:blue')

fig.tight_layout()
plt.show()
```
%% Output
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment