High risk. Don't ship without significant remediation.
Scanned 5/9/2026, 6:18:27 AM · Cached result · Deep Scan · 91 rules
AIVSS Score
High
Severity Breakdown
- Critical: 0
- High: 10
- Medium: 56
- Low: 14
MCP Server Information
Findings
This package receives a D grade with a safety score of 55/100, driven by 10 high-severity issues alongside 56 medium-severity findings. Most findings center on verbose error handling (45 instances) that could leak sensitive information, plus readiness gaps (14 instances) that may indicate incomplete security hardening. The 6 prompt-injection vulnerabilities and 3 tool-poisoning risks pose direct threats to safe operation, while server configuration weaknesses (9 findings) and potential data-exfiltration paths (2 findings) compound the overall concern.
Per-finding remediation (AI) generated by bedrock-claude-haiku-4-5 — 38 of 80 findings. Click any finding to read.
No known CVEs found for this package or its dependencies.
Scan Details
Done
80 findings
Tool handlers use module-level AWS clients (self.s3_client, self.glue_client) initialized from os.getenv() without consulting caller identity or per-request credentials, enabling confused deputy attacks on S3 and Glue resources.
Evidence
| 1 | """FastMCP server for SQL to PySpark conversion with code review and optimization.""" |
| 2 | |
| 3 | import json |
| 4 | import os |
| 5 | import re |
| 6 | from typing import Any, Dict, List, Optional, Union |
| 7 | |
| 8 | from fastmcp import FastMCP |
| 9 | |
| 10 | from .advanced_optimizer import AdvancedOptimizer |
| 11 | from .aws_glue_integration import ( |
| 12 | AWSGlueIntegration, |
| 13 | DataCatalogTable, |
| 14 | DataFormat, |
| 15 | GlueJobConfig, |
| 16 | GlueJobType, |
| 17 | ) |
| 18 | from .batch_processor import BatchProcessor |
| 19 | from .code_reviewer import PySparkCodeReviewer |
| 20 | from .data_source_anal |
Remediation (AI)
The problem is that module-level AWS clients (self.s3_client, self.glue_client) are initialized once from os.getenv() without validating caller identity, allowing any caller to use the server's ambient credentials to access S3 and Glue resources. Modify the FastMCP server to accept caller credentials (e.g., via request headers or context) and pass them to a new method that creates per-request boto3 clients using sts.assume_role() or explicit credential passing instead of relying on ambient credentials. This ensures each caller's requests are isolated to their own AWS identity and permissions. Verify by adding logging to confirm that each tool invocation creates a new client with the caller's credentials and that cross-caller access is denied.
LLM consensus
DataSourceAnalyzer initializes boto3 S3 and Glue clients at module import time without per-request credential validation, allowing handlers to access AWS resources using server's ambient credentials regardless of caller identity.
Evidence
| 57 | def __init__(self): |
| 58 | self.s3_client = None |
| 59 | self.glue_client = None |
| 60 | self._initialize_aws_clients() |
| 61 | |
| 62 | def _initialize_aws_clients(self): |
| 63 | """Initialize AWS clients if credentials are available.""" |
| 64 | if not BOTO3_AVAILABLE: |
| 65 | return |
| 66 | |
| 67 | try: |
| 68 | self.s3_client = boto3.client("s3") |
| 69 | self.glue_client = boto3.client("glue") |
| 70 | except (NoCredentialsError, Exception): |
| 71 | # AWS clients will be None if no credentials |
| 72 | |
Remediation (AI)
The problem is that DataSourceAnalyzer._initialize_aws_clients() creates boto3 clients at module import time using only os.getenv(), with no per-request credential validation, so all handlers share the same AWS identity. Refactor _initialize_aws_clients() to accept optional caller credentials as parameters and defer client initialization until tool invocation time, or implement a factory method that creates fresh clients per request using caller-provided credentials or STS assume-role. This prevents the confused deputy problem by ensuring each request uses only the caller's credentials. Verify by instrumenting the code to confirm that clients are created fresh per request and that attempting to access resources outside the caller's IAM policy fails.
MemoryManager uses a shared SQLite database at a fixed path (os.getenv PYSPARK_TOOLS_DB_PATH or ~/.cache/mcp/memory.sqlite) without per-caller isolation, allowing all callers to access and modify shared conversion history and metrics.
Evidence
| 67 | expires_at: Optional[str] = None |
| 68 | |
| 69 | |
| 70 | class MemoryManager: |
| 71 | """SQLite-based memory manager for storing conversion history and context.""" |
| 72 | |
| 73 | def __init__(self, db_path: Optional[str] = None): |
| 74 | if db_path is None: |
| 75 | db_path = os.getenv( |
| 76 | "PYSPARK_TOOLS_DB_PATH", |
| 77 | os.path.expanduser("~/.cache/mcp/memory.sqlite"), |
| 78 | ) |
| 79 | self.db_path = Path(db_path) |
| 80 | self.db_path.parent.mkdir(parents=True, exist_ok=True) |
| 81 | # Thread-local s |
Remediation (AI)
The problem is that MemoryManager uses a shared SQLite database at a fixed path without per-caller isolation, allowing all callers to read and modify each other's conversion history and metrics. Modify the MemoryManager.__init__() method to accept a caller_id parameter and include it in the database path (e.g., ~/.cache/mcp/memory_{caller_id}.sqlite) or add a caller_id column to all tables and filter queries by caller_id. This ensures each caller's data is isolated. Verify by creating two separate MemoryManager instances with different caller IDs and confirming that queries from one caller do not return data from the other.
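The per-caller database-path variant of this fix might look like the sketch below; `db_path_for_caller` is a hypothetical helper, and the caller-ID character whitelist is an assumption added so an attacker-controlled ID cannot escape the cache directory via path traversal.

```python
import re
from pathlib import Path

# Only allow simple identifiers so caller_id cannot inject "../" segments.
SAFE_CALLER = re.compile(r"^[A-Za-z0-9_-]{1,64}$")


def db_path_for_caller(caller_id: str, base_dir: str = "~/.cache/mcp") -> Path:
    # One SQLite file per caller isolates conversion history and metrics.
    if not SAFE_CALLER.match(caller_id):
        raise ValueError("invalid caller_id")
    return Path(base_dir).expanduser() / f"memory_{caller_id}.sqlite"
```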
SQL injection risk. SQL call receives a query built with string interpolation (%, +, f-string, or template literal) instead of placeholder parameters. Use parameterised queries.
Evidence
| 264 | for column, column_type in required_columns.items(): |
| 265 | if column not in columns: |
| 266 | conn.execute( |
| 267 | f"ALTER TABLE conversions ADD COLUMN {column} {column_type}" |
| 268 | ) |
| 269 | |
| 270 | conn.commit() |
Remediation (AI)
The problem is that the ALTER TABLE statement uses f-string interpolation for the column name and type, creating a SQL injection vulnerability if column or column_type contain malicious SQL. Replace the f-string with parameterized query syntax; however, note that column names cannot be parameterized in SQLite, so use a whitelist of allowed column names and validate column_type against a predefined set of safe types before interpolation. This prevents injection of arbitrary SQL. Verify by attempting to pass a column name like "x; DROP TABLE conversions; --" and confirming it is rejected or safely escaped.
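Since SQLite cannot parameterize identifiers, the whitelist approach described above can be sketched like this; the contents of `ALLOWED_COLUMNS` are illustrative placeholders, not the package's actual schema.

```python
# Hypothetical whitelist — replace with the columns the migration actually adds.
ALLOWED_COLUMNS = {"caller_id", "expires_at", "optimization_score"}
ALLOWED_TYPES = {"TEXT", "INTEGER", "REAL", "BLOB"}


def safe_add_column(conn, column: str, column_type: str) -> None:
    # Identifiers cannot be bound as SQL parameters, so validate both the
    # column name and its type against closed sets before interpolation.
    if column not in ALLOWED_COLUMNS or column_type not in ALLOWED_TYPES:
        raise ValueError(f"refusing to add column {column!r}")
    conn.execute(f'ALTER TABLE conversions ADD COLUMN "{column}" {column_type}')
```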
BatchProcessor performs FILESYSTEM side effects (writes output files and directories via OutputManager) that may not be explicitly disclosed in tool descriptions focused only on 'processing' or 'converting'.
Evidence
| 1 | """ |
| 2 | Batch processor for handling multiple SQL files and directories with comprehensive |
| 3 | job management, status tracking, and error handling. |
Remediation (AI)
The problem is that BatchProcessor performs filesystem side effects (writing output files and directories) that are not disclosed in tool descriptions, violating the MCP principle of transparent side effects. Update the tool description for any BatchProcessor-based tool to explicitly document that it writes files to disk, including the output directory path and file types created. This allows callers to understand and consent to the side effects. Verify by reading the tool description and confirming it mentions filesystem writes and the output location.
DataSourceAnalyzer._initialize_aws_clients performs NETWORK side effects (initializes boto3 S3 and Glue clients) that may not be disclosed in tool descriptions focused only on 'analyzing' data sources.
Evidence
| 57 | def __init__(self): |
| 58 | self.s3_client = None |
| 59 | self.glue_client = None |
| 60 | self._initialize_aws_clients() |
| 61 | |
| 62 | def _initialize_aws_clients(self): |
Remediation (AI)
The problem is that DataSourceAnalyzer._initialize_aws_clients() performs network side effects (initializing boto3 clients) that are not disclosed in tool descriptions focused only on 'analyzing' data sources. Update the tool description to explicitly state that the tool may initialize AWS API clients and make network calls to S3 and Glue services. Alternatively, defer client initialization to lazy-load only when AWS operations are actually needed. Verify by checking the tool description and confirming it mentions potential AWS API calls.
MemoryManager.__init__ performs FILESYSTEM side effects (creates SQLite database and directories) not disclosed in typical tool descriptions that only mention 'storing' or 'caching' data.
Evidence
| 44 | @dataclass |
| 45 | class PerformanceMetric: |
| 46 | """Represents a performance metric.""" |
| 47 | |
| 48 | id: Optional[int] |
Remediation (AI)
The problem is that MemoryManager.__init__() performs filesystem side effects (creating SQLite database and directories) not disclosed in tool descriptions that only mention 'storing' or 'caching' data. Update the tool description to explicitly document that the tool creates a local SQLite database file and may create cache directories. Alternatively, defer database creation to lazy-load only when data is first stored. Verify by reading the tool description and confirming it mentions filesystem creation.
Tool 'convert_sql_to_pyspark' returns untrusted SQL query verbatim in pyspark_code output without provenance wrapper, enabling indirect prompt injection via user-supplied SQL strings.
Evidence
| 1 | """FastMCP server for SQL to PySpark conversion with code review and optimization.""" |
| 2 | |
| 3 | import json |
| 4 | import os |
| 5 | import re |
| 6 | from typing import Any, Dict, List, Optional, Union |
| 7 | |
| 8 | from fastmcp import FastMCP |
| 9 | |
| 10 | from .advanced_optimizer import AdvancedOptimizer |
| 11 | from .aws_glue_integration import ( |
| 12 | AWSGlueIntegration, |
| 13 | DataCatalogTable, |
| 14 | DataFormat, |
| 15 | GlueJobConfig, |
| 16 | GlueJobType, |
| 17 | ) |
| 18 | from .batch_processor import BatchProcessor |
| 19 | from .code_reviewer import PySparkCodeReviewer |
| 20 | from .data_source_anal |
Remediation (AI)
The problem is that convert_sql_to_pyspark echoes the untrusted, user-supplied SQL query verbatim in its pyspark_code output without markers identifying its provenance, enabling indirect prompt injection if the output is consumed by an LLM. Wrap the echoed SQL in provenance delimiters (e.g. comment markers around the original query) or return it in a structured field such as {"source": "user_sql", "content": query}. This clearly separates external input from generated code. Verify by checking the tool output and confirming the echoed SQL carries source identifiers.
LLM consensus
Tool 'review_pyspark_code' returns untrusted user-supplied PySpark code snippets in review output without provenance markers, allowing injection of malicious instructions into LLM context.
Evidence
| 1 | """FastMCP server for SQL to PySpark conversion with code review and optimization.""" |
| 2 | |
| 3 | import json |
| 4 | import os |
| 5 | import re |
| 6 | from typing import Any, Dict, List, Optional, Union |
| 7 | |
| 8 | from fastmcp import FastMCP |
| 9 | |
| 10 | from .advanced_optimizer import AdvancedOptimizer |
| 11 | from .aws_glue_integration import ( |
| 12 | AWSGlueIntegration, |
| 13 | DataCatalogTable, |
| 14 | DataFormat, |
| 15 | GlueJobConfig, |
| 16 | GlueJobType, |
| 17 | ) |
| 18 | from .batch_processor import BatchProcessor |
| 19 | from .code_reviewer import PySparkCodeReviewer |
| 20 | from .data_source_anal |
Remediation (AI)
The problem is that review_pyspark_code echoes untrusted, user-supplied PySpark snippets in its review output without provenance markers, allowing instructions embedded in a snippet to be injected into the consuming LLM's context. Wrap quoted code in provenance delimiters or return it in a structured field such as {"source": "user_code", "content": code}. This clearly identifies external content. Verify by checking the review output and confirming all quoted snippets carry source identifiers.
LLM consensus
Tool 'analyze_codebase' returns untrusted file content from third-party PySpark codebases (via data_source_analyzer.analyze_s3_location and file system reads) without delimiters identifying source provenance.
Evidence
| 1 | """FastMCP server for SQL to PySpark conversion with code review and optimization.""" |
| 2 | |
| 3 | import json |
| 4 | import os |
| 5 | import re |
| 6 | from typing import Any, Dict, List, Optional, Union |
| 7 | |
| 8 | from fastmcp import FastMCP |
| 9 | |
| 10 | from .advanced_optimizer import AdvancedOptimizer |
| 11 | from .aws_glue_integration import ( |
| 12 | AWSGlueIntegration, |
| 13 | DataCatalogTable, |
| 14 | DataFormat, |
| 15 | GlueJobConfig, |
| 16 | GlueJobType, |
| 17 | ) |
| 18 | from .batch_processor import BatchProcessor |
| 19 | from .code_reviewer import PySparkCodeReviewer |
| 20 | from .data_source_anal |
Remediation (AI)
The problem is that analyze_codebase returns untrusted file content from third-party S3 locations and filesystems without delimiters identifying source provenance, enabling prompt injection if the output is consumed by an LLM. Wrap all file content returned from external sources with provenance markers such as `<!-- FILE_START: {path} -->...<!-- FILE_END: {path} -->` or return it in a structured format with {"source": "file", "path": path, "content": content}. This clearly identifies external content. Verify by checking the tool output and confirming that all file content is wrapped with source identifiers.
LLM consensus
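The structured-envelope option mentioned in these remediations can be sketched as below; `wrap_untrusted` is a hypothetical helper name, and the exact field set is an assumption.

```python
import json


def wrap_untrusted(source: str, path: str, content: str) -> str:
    # Return external content in a structured envelope so a consuming LLM
    # can distinguish quoted data from instructions it should follow.
    return json.dumps({
        "source": source,        # e.g. "s3", "file", "user_sql"
        "path": path,
        "untrusted": True,       # explicit provenance flag
        "content": content,
    })
```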
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 2813 | return result |
| 2814 | |
| 2815 | except Exception as e: |
| 2816 | return {"status": "error", "message": str(e)} |
| 2817 | |
| 2818 | |
| 2819 | @app.tool() |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
LLM consensus
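The generic-message-plus-server-side-logging pattern recommended here can be sketched as follows; the `safe_error_response` name and the correlation-id field are illustrative additions, not the package's existing API.

```python
import logging
import uuid

logger = logging.getLogger(__name__)


def safe_error_response(exc: Exception) -> dict:
    # Log the full traceback server-side; return only a generic message and
    # a short correlation id the operator can grep for in the logs.
    error_id = uuid.uuid4().hex[:8]
    logger.exception("tool error [%s]", error_id)
    return {
        "status": "error",
        "message": "An internal error occurred.",
        "error_id": error_id,
    }
```

Each `except Exception as e: return {"status": "error", "message": str(e)}` handler would then become `except Exception as e: return safe_error_response(e)`.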
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 2783 | return result |
| 2784 | |
| 2785 | except Exception as e: |
| 2786 | return {"status": "error", "message": str(e)} |
| 2787 | |
| 2788 | |
| 2789 | @app.tool() |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 2061 | } |
| 2062 | |
| 2063 | except Exception as e: |
| 2064 | return {"status": "error", "message": str(e)} |
| 2065 | |
| 2066 | |
| 2067 | @app.tool() |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 1004 | } |
| 1005 | |
| 1006 | except Exception as e: |
| 1007 | return {"status": "error", "message": str(e)} |
| 1008 | |
| 1009 | def _get_input_format(self, format: Optional[DataFormat]) -> str: |
| 1010 | """Get input format for Glue table definition.""" |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 3070 | return result |
| 3071 | |
| 3072 | except Exception as e: |
| 3073 | return {"status": "error", "message": str(e)} |
| 3074 | |
| 3075 | |
| 3076 | @app.tool() |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 1641 | } |
| 1642 | |
| 1643 | except Exception as e: |
| 1644 | return {"status": "error", "message": str(e)} |
| 1645 | |
| 1646 | |
| 1647 | @app.tool() |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 3100 | return result |
| 3101 | |
| 3102 | except Exception as e: |
| 3103 | return {"status": "error", "message": str(e)} |
| 3104 | |
| 3105 | |
| 3106 | @app.tool() |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 657 | return template |
| 658 | |
| 659 | except Exception as e: |
| 660 | return {"status": "error", "message": str(e)} |
| 661 | |
| 662 | def generate_data_catalog_table_definition( |
| 663 | self, |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 871 | } |
| 872 | |
| 873 | except Exception as e: |
| 874 | return {"status": "error", "message": str(e)} |
| 875 | |
| 876 | def generate_schema_evolution_strategy( |
| 877 | self, |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 1713 | return result |
| 1714 | |
| 1715 | except Exception as e: |
| 1716 | return {"status": "error", "message": str(e)} |
| 1717 | |
| 1718 | |
| 1719 | @app.tool() |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 237 | } |
| 238 | |
| 239 | except Exception as e: |
| 240 | return {"status": "error", "message": str(e)} |
| 241 | |
| 242 | def _generate_imports(self, config: GlueJobConfig) -> str: |
| 243 | """Generate import statements based on configuration.""" |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 3019 | return result |
| 3020 | |
| 3021 | except Exception as e: |
| 3022 | return {"status": "error", "message": str(e)} |
| 3023 | |
| 3024 | |
| 3025 | @app.tool() |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 531 | } |
| 532 | |
| 533 | except Exception as e: |
| 534 | return {"status": "error", "message": str(e)} |
| 535 | |
| 536 | def _extract_dataframe_operations(self, pyspark_code: str) -> List[str]: |
| 537 | """Extract DataFrame operations from PySpark code.""" |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 2173 | } |
| 2174 | |
| 2175 | except Exception as e: |
| 2176 | return {"status": "error", "message": str(e)} |
| 2177 | |
| 2178 | |
| 2179 | @app.tool() |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 1491 | } |
| 1492 | |
| 1493 | except Exception as e: |
| 1494 | return {"status": "error", "message": str(e)} |
| 1495 | |
| 1496 | def generate_small_files_consolidation_job( |
| 1497 | self, |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 1932 | } |
| 1933 | |
| 1934 | except Exception as e: |
| 1935 | return {"status": "error", "message": str(e)} |
| 1936 | |
| 1937 | |
| 1938 | @app.tool() |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 2638 | return result |
| 2639 | |
| 2640 | except Exception as e: |
| 2641 | return {"status": "error", "message": str(e)} |
| 2642 | |
| 2643 | |
| 2644 | @app.tool() |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 1623 | } |
| 1624 | |
| 1625 | except Exception as e: |
| 1626 | return {"status": "error", "message": str(e)} |
| 1627 | |
| 1628 | def _analyze_partitioning_strategy( |
| 1629 | self, table_info: DataCatalogTable, query_patterns: Optional[List[str]] |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 2244 | } |
| 2245 | |
| 2246 | except Exception as e: |
| 2247 | return {"status": "error", "message": str(e)} |
| 2248 | |
| 2249 | |
| 2250 | @app.tool() |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 403 | return result |
| 404 | |
| 405 | except Exception as e: |
| 406 | return {"status": "error", "message": str(e)} |
| 407 | |
| 408 | |
| 409 | @app.tool() |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 1903 | } |
| 1904 | |
| 1905 | except Exception as e: |
| 1906 | return {"status": "error", "message": str(e)} |
| 1907 | |
| 1908 | |
| 1909 | @app.tool() |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 3151 | return result |
| 3152 | |
| 3153 | except Exception as e: |
| 3154 | return {"status": "error", "message": str(e)} |
| 3155 | |
| 3156 | |
| 3157 | @app.tool() |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 3228 | } |
| 3229 | |
| 3230 | except Exception as e: |
| 3231 | return {"status": "error", "message": str(e)} |
| 3232 | |
| 3233 | |
| 3234 | # ============================================================================= |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 1843 | } |
| 1844 | |
| 1845 | except Exception as e: |
| 1846 | return {"status": "error", "message": str(e), "pdf_path": pdf_path} |
| 1847 | |
| 1848 | |
| 1849 | @app.tool() |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 1614 | return {"status": "success", "message": f"Context stored with key: {key}"} |
| 1615 | |
| 1616 | except Exception as e: |
| 1617 | return {"status": "error", "message": str(e)} |
| 1618 | |
| 1619 | |
| 1620 | @app.tool() |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 1808 | } |
| 1809 | |
| 1810 | except Exception as e: |
| 1811 | return {"status": "error", "message": str(e)} |
| 1812 | |
| 1813 | |
| 1814 | @app.tool() |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 536 | } |
| 537 | |
| 538 | except Exception as e: |
| 539 | return {"status": "error", "message": str(e)} |
| 540 | |
| 541 | |
| 542 | @app.tool() |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 1422 | } |
| 1423 | |
| 1424 | except Exception as e: |
| 1425 | return {"status": "error", "message": str(e)} |
| 1426 | |
| 1427 | def generate_s3_optimization_strategy( |
| 1428 | self, |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 766 | } |
| 767 | |
| 768 | except Exception as e: |
| 769 | return {"status": "error", "message": str(e)} |
| 770 | |
| 771 | def detect_schema_from_sample_data( |
| 772 | self, |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 2914 | return result |
| 2915 | |
| 2916 | except Exception as e: |
| 2917 | return {"status": "error", "message": str(e)} |
| 2918 | |
| 2919 | |
| 2920 | @app.tool() |
Remediation (AI)
The problem is that exception handlers return str(e) which exposes full exception details including internal paths, library versions, and query structure useful for reconnaissance. Replace all `str(e)` with a generic error message such as "An error occurred processing your request" and log the full exception details server-side using logging.exception(). This hides sensitive information from callers. Verify by triggering an error and confirming the response contains only a generic message, not stack traces or internal paths.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 2701 | return result |
| 2702 | |
| 2703 | except Exception as e: |
| 2704 | return {"status": "error", "message": str(e)} |
| 2705 | |
| 2706 | |
| 2707 | @app.tool() |
Remediation
Log the full exception server-side with a correlation ID; return only {"error_id": id, "message": "internal error"} to the caller. Never enable Flask debug mode in production.
LLM consensus
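The correlation-ID variant of the remediation can be sketched with only the standard library; `run_tool` is a hypothetical wrapper name:

```python
import logging
import uuid

logger = logging.getLogger(__name__)

def run_tool(fn):
    """Execute a tool body; on failure, return an opaque error reference."""
    try:
        return fn()
    except Exception:
        # The ID ties the caller-visible error to the full server-side log entry.
        error_id = uuid.uuid4().hex
        logger.exception("tool failed [error_id=%s]", error_id)
        return {"error_id": error_id, "message": "internal error"}
```

Operators can grep the server log for the `error_id` a caller reports, without the caller ever seeing exception detail.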
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 2869 | return result |
| 2870 | |
| 2871 | except Exception as e: |
| 2872 | return {"status": "error", "message": str(e)} |
| 2873 | |
| 2874 | |
| 2875 | @app.tool() |
Remediation
Log the full exception server-side with a correlation ID; return only {"error_id": id, "message": "internal error"} to the caller. Never enable Flask debug mode in production.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 2208 | } |
| 2209 | |
| 2210 | except Exception as e: |
| 2211 | return {"status": "error", "message": str(e)} |
| 2212 | |
| 2213 | |
| 2214 | @app.tool() |
Remediation
Log the full exception server-side with a correlation ID; return only {"error_id": id, "message": "internal error"} to the caller. Never enable Flask debug mode in production.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 2971 | return result |
| 2972 | |
| 2973 | except Exception as e: |
| 2974 | return {"status": "error", "message": str(e)} |
| 2975 | |
| 2976 | |
| 2977 | @app.tool() |
Remediation
Log the full exception server-side with a correlation ID; return only {"error_id": id, "message": "internal error"} to the caller. Never enable Flask debug mode in production.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 2294 | } |
| 2295 | |
| 2296 | except Exception as e: |
| 2297 | return {"status": "error", "message": str(e)} |
| 2298 | |
| 2299 | |
| 2300 | @app.tool() |
Remediation
Log the full exception server-side with a correlation ID; return only {"error_id": id, "message": "internal error"} to the caller. Never enable Flask debug mode in production.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 1568 | } |
| 1569 | |
| 1570 | except Exception as e: |
| 1571 | return {"status": "error", "message": str(e)} |
| 1572 | |
| 1573 | |
| 1574 | @app.tool() |
Remediation
Log the full exception server-side with a correlation ID; return only {"error_id": id, "message": "internal error"} to the caller. Never enable Flask debug mode in production.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 2227 | } |
| 2228 | |
| 2229 | except Exception as e: |
| 2230 | return {"status": "error", "message": str(e)} |
| 2231 | |
| 2232 | def _generate_timestamp_incremental_job( |
| 2233 | self, |
Remediation
Log the full exception server-side with a correlation ID; return only {"error_id": id, "message": "internal error"} to the caller. Never enable Flask debug mode in production.
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 1594 | } |
| 1595 | |
| 1596 | except Exception as e: |
| 1597 | return {"status": "error", "message": str(e)} |
| 1598 | |
| 1599 | |
| 1600 | @app.tool() |
Remediation
Log the full exception server-side with a correlation ID; return only {"error_id": id, "message": "internal error"} to the caller. Never enable Flask debug mode in production.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 2115 | } |
| 2116 | |
| 2117 | except Exception as e: |
| 2118 | return {"status": "error", "message": str(e)} |
| 2119 | |
| 2120 | |
| 2121 | @app.tool() |
Remediation
Log the full exception server-side with a correlation ID; return only {"error_id": id, "message": "internal error"} to the caller. Never enable Flask debug mode in production.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 2006 | } |
| 2007 | |
| 2008 | except Exception as e: |
| 2009 | return {"status": "error", "message": str(e)} |
| 2010 | |
| 2011 | |
| 2012 | @app.tool() |
Remediation
Log the full exception server-side with a correlation ID; return only {"error_id": id, "message": "internal error"} to the caller. Never enable Flask debug mode in production.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 1758 | } |
| 1759 | |
| 1760 | except Exception as e: |
| 1761 | return {"status": "error", "message": str(e)} |
| 1762 | |
| 1763 | |
| 1764 | @app.tool() |
Remediation
Log the full exception server-side with a correlation ID; return only {"error_id": id, "message": "internal error"} to the caller. Never enable Flask debug mode in production.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 638 | } |
| 639 | |
| 640 | except Exception as e: |
| 641 | return {"status": "error", "message": str(e)} |
| 642 | |
| 643 | |
| 644 | # Helper functions for the new tools |
Remediation
Log the full exception server-side with a correlation ID; return only {"error_id": id, "message": "internal error"} to the caller. Never enable Flask debug mode in production.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 1970 | } |
| 1971 | |
| 1972 | except Exception as e: |
| 1973 | return {"status": "error", "message": str(e)} |
| 1974 | |
| 1975 | |
| 1976 | @app.tool() |
Remediation
Log the full exception server-side with a correlation ID; return only {"error_id": id, "message": "internal error"} to the caller. Never enable Flask debug mode in production.
LLM consensus
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 2070 | } |
| 2071 | |
| 2072 | except Exception as e: |
| 2073 | return {"status": "error", "message": str(e)} |
| 2074 | |
| 2075 | def generate_change_data_capture_job( |
| 2076 | self, |
Remediation
Log the full exception server-side with a correlation ID; return only {"error_id": id, "message": "internal error"} to the caller. Never enable Flask debug mode in production.
Full exception detail or stack trace returned to the caller. Leaking tracebacks exposes internal paths, library versions, and query structure — useful recon for attackers.
Evidence
| 2022 | } |
| 2023 | |
| 2024 | except Exception as e: |
| 2025 | return {"status": "error", "message": str(e)} |
| 2026 | |
| 2027 | def generate_job_bookmark_configuration( |
| 2028 | self, |
Remediation
Log the full exception server-side with a correlation ID; return only {"error_id": id, "message": "internal error"} to the caller. Never enable Flask debug mode in production.
MCP tool input schema exposes an unconstrained string/any field with a risky name (command/query/sql/code/script/url/path/expr/eval). Any caller can pass arbitrary values, which typically widens the tool's blast radius well beyond its intent. Narrow the schema with `.enum()`, `.regex()`, `.max()`, `Literal[...]`, Pydantic `Field(max_length=..., pattern=...)`, or a JSON Schema `enum` / `pattern` / `maxLength`.
Evidence
| 16 | pattern_id: str |
| 17 | pattern_hash: str |
| 18 | description: str |
| 19 | code_template: str |
| 20 | parameters: List[str] |
| 21 | usage_count: int |
| 22 | examples: List[str] |
Remediation
Shape the schema to the tool's actual intent:
- Zod: chain `.enum([...])`, `.regex(/.../)`, or `.max(n)`; prefer `z.enum([...])` or `z.literal(...)` when the value set is small.
- Pydantic: use `Literal["a", "b"]` or `Field(max_length=..., pattern=r"...")`.
- JSON Schema: add `"enum"`, `"pattern"`, or `"maxLength"` to the property.
An overbroad schema is an "overpowered tool": the model has nothing to prevent it from calling the tool with input far beyond what the tool's prose contract promises.
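For a FastMCP server, the Pydantic side of this remediation might look like the sketch below (Pydantic v2 syntax; the model name, field names, size bound, and path pattern are illustrative policy choices, not taken from this package):

```python
from typing import Literal

from pydantic import BaseModel, Field, ValidationError

class ConvertSqlInput(BaseModel):
    # Free text, but bounded in size.
    query: str = Field(max_length=20_000)
    # Closed vocabulary instead of an open string.
    dialect: Literal["mysql", "postgres", "tsql"] = "mysql"
    # Constrain path-like fields to a safe shape.
    source_file: str = Field(max_length=255, pattern=r"^[\w./-]+\.(sql|pdf)$")
```

An out-of-vocabulary `dialect` or a path outside the pattern now fails validation before the handler runs, raising `ValidationError`.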
MCP tool input schema exposes an unconstrained string/any field with a risky name (command/query/sql/code/script/url/path/expr/eval). Any caller can pass arbitrary values, which typically widens the tool's blast radius well beyond its intent. Narrow the schema with `.enum()`, `.regex()`, `.max()`, `Literal[...]`, Pydantic `Field(max_length=..., pattern=...)`, or a JSON Schema `enum` / `pattern` / `maxLength`.
Evidence
| 50 | @dataclass |
| 51 | class ExtractedSQL: |
| 52 | """Represents an extracted SQL query with metadata.""" |
| 53 | |
| 54 | query: str |
| 55 | source_file: str |
| 56 | page_number: Optional[int] = None |
| 57 | line_number: Optional[int] = None |
Remediation
Shape the schema to the tool's actual intent:
- Zod: chain `.enum([...])`, `.regex(/.../)`, or `.max(n)`; prefer `z.enum([...])` or `z.literal(...)` when the value set is small.
- Pydantic: use `Literal["a", "b"]` or `Field(max_length=..., pattern=r"...")`.
- JSON Schema: add `"enum"`, `"pattern"`, or `"maxLength"` to the property.
An overbroad schema is an "overpowered tool": the model has nothing to prevent it from calling the tool with input far beyond what the tool's prose contract promises.
MCP tool input schema exposes an unconstrained string/any field with a risky name (command/query/sql/code/script/url/path/expr/eval). Any caller can pass arbitrary values, which typically widens the tool's blast radius well beyond its intent. Narrow the schema with `.enum()`, `.regex()`, `.max()`, `Literal[...]`, Pydantic `Field(max_length=..., pattern=...)`, or a JSON Schema `enum` / `pattern` / `maxLength`.
Evidence
| 38 | id: Optional[int] |
| 39 | pattern_hash: str |
| 40 | pattern_description: str |
| 41 | code_template: str |
| 42 | usage_count: int |
| 43 | created_at: Optional[str] = None |
Remediation
Shape the schema to the tool's actual intent:
- Zod: chain `.enum([...])`, `.regex(/.../)`, or `.max(n)`; prefer `z.enum([...])` or `z.literal(...)` when the value set is small.
- Pydantic: use `Literal["a", "b"]` or `Field(max_length=..., pattern=r"...")`.
- JSON Schema: add `"enum"`, `"pattern"`, or `"maxLength"` to the property.
An overbroad schema is an "overpowered tool": the model has nothing to prevent it from calling the tool with input far beyond what the tool's prose contract promises.
MCP tool input schema exposes an unconstrained string/any field with a risky name (command/query/sql/code/script/url/path/expr/eval). Any caller can pass arbitrary values, which typically widens the tool's blast radius well beyond its intent. Narrow the schema with `.enum()`, `.regex()`, `.max()`, `Literal[...]`, Pydantic `Field(max_length=..., pattern=...)`, or a JSON Schema `enum` / `pattern` / `maxLength`.
Evidence
| 42 | @dataclass |
| 43 | class ConversionResult: |
| 44 | """Result of converting a single SQL query to PySpark.""" |
| 45 | |
| 46 | sql_query: str |
| 47 | pyspark_code: str |
| 48 | optimizations: List[str] |
| 49 | success: bool |
Remediation
Shape the schema to the tool's actual intent:
- Zod: chain `.enum([...])`, `.regex(/.../)`, or `.max(n)`; prefer `z.enum([...])` or `z.literal(...)` when the value set is small.
- Pydantic: use `Literal["a", "b"]` or `Field(max_length=..., pattern=r"...")`.
- JSON Schema: add `"enum"`, `"pattern"`, or `"maxLength"` to the property.
An overbroad schema is an "overpowered tool": the model has nothing to prevent it from calling the tool with input far beyond what the tool's prose contract promises.
MCP tool input schema exposes an unconstrained string/any field with a risky name (command/query/sql/code/script/url/path/expr/eval). Any caller can pass arbitrary values, which typically widens the tool's blast radius well beyond its intent. Narrow the schema with `.enum()`, `.regex()`, `.max()`, `Literal[...]`, Pydantic `Field(max_length=..., pattern=...)`, or a JSON Schema `enum` / `pattern` / `maxLength`.
Evidence
| 29 | """Represents a match of a pattern in code.""" |
| 30 | |
| 31 | pattern_id: str |
| 32 | code_snippet: str |
| 33 | start_line: int |
| 34 | end_line: int |
| 35 | confidence: float |
Remediation
Shape the schema to the tool's actual intent:
- Zod: chain `.enum([...])`, `.regex(/.../)`, or `.max(n)`; prefer `z.enum([...])` or `z.literal(...)` when the value set is small.
- Pydantic: use `Literal["a", "b"]` or `Field(max_length=..., pattern=r"...")`.
- JSON Schema: add `"enum"`, `"pattern"`, or `"maxLength"` to the property.
An overbroad schema is an "overpowered tool": the model has nothing to prevent it from calling the tool with input far beyond what the tool's prose contract promises.
MCP manifest declares tools but no authentication field is present (none of: auth, authorization, bearer, oauth, mtls, apiKey, api_key, basic, token, authToken). Absence is a weak signal — confirm whether the server relies on network-layer or host-level auth, or declare the real mechanism explicitly so reviewers can audit it.
Evidence
| 1 | Metadata-Version: 2.4 |
| 2 | Name: pyspark-tools |
| 3 | Version: 0.0.4 |
| 4 | Summary: MCP server for SQL migration, AWS Glue job generation, and PySpark optimization |
| 5 | Author-email: Annas Mazhar <annas.mazhar10@gmail.com> |
| 6 | Project-URL: Homepage, https://github.com/AnnasMazhar/pyspark_mcp |
| 7 | Project-URL: Repository, https://github.com/AnnasMazhar/pyspark_mcp |
| 8 | Project-URL: Issues, https://github.com/AnnasMazhar/pyspark_mcp/issues |
| 9 | Requires-Python: >=3.10 |
| 10 | Description-Content-Type: text/markdown |
| 11 | License-File: LICENSE |
| 12 | Requires- |
Remediation
Declare a real authentication mechanism in the manifest, matching what the running server actually enforces:
- `"auth": "bearer"` with a token scheme documented for callers
- `"auth": "oauth"` / `"oauth2": { ... }` for delegated flows
- `"apiKey": { "header": "X-API-Key", "prefix": "..." }`
- `"mtls": true` when client certificates are required
If the server is intentionally unauthenticated (stdio-only, local developer tool, trusted-host network), state that assumption explicitly in the manifest so reviewers can audit it.
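A manifest fragment along these lines, sketched as a Python dict for illustration (the exact manifest schema depends on the registry you publish to; the field names follow the remediation text above):

```python
# Illustrative manifest fragment; keep whichever auth entry matches what
# the running server actually enforces.
manifest = {
    "name": "pyspark-tools",
    "version": "0.0.4",
    "auth": "bearer",  # bearer-token scheme documented for callers
    # Alternatives, depending on the deployment:
    # "oauth2": {"flows": "authorization_code"}   # delegated flows
    # "apiKey": {"header": "X-API-Key"}           # key-based access
    # "mtls": True                                # client certificates
}
```

Reviewers can then check the declared mechanism against the server's actual enforcement instead of guessing.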
MCP manifest declares tools but no authentication field is present (none of: auth, authorization, bearer, oauth, mtls, apiKey, api_key, basic, token, authToken). Absence is a weak signal — confirm whether the server relies on network-layer or host-level auth, or declare the real mechanism explicitly so reviewers can audit it.
Evidence
| 1 | Metadata-Version: 2.4 |
| 2 | Name: pyspark-tools |
| 3 | Version: 0.0.4 |
| 4 | Summary: MCP server for SQL migration, AWS Glue job generation, and PySpark optimization |
| 5 | Author-email: Annas Mazhar <annas.mazhar10@gmail.com> |
| 6 | Project-URL: Homepage, https://github.com/AnnasMazhar/pyspark_mcp |
| 7 | Project-URL: Repository, https://github.com/AnnasMazhar/pyspark_mcp |
| 8 | Project-URL: Issues, https://github.com/AnnasMazhar/pyspark_mcp/issues |
| 9 | Requires-Python: >=3.10 |
| 10 | Description-Content-Type: text/markdown |
| 11 | License-File: LICENSE |
| 12 | Requires- |
Remediation
Declare a real authentication mechanism in the manifest, matching what the running server actually enforces:
- `"auth": "bearer"` with a token scheme documented for callers
- `"auth": "oauth"` / `"oauth2": { ... }` for delegated flows
- `"apiKey": { "header": "X-API-Key", "prefix": "..." }`
- `"mtls": true` when client certificates are required
If the server is intentionally unauthenticated (stdio-only, local developer tool, trusted-host network), state that assumption explicitly in the manifest so reviewers can audit it.
MCP manifest declares tools but no authentication field is present (none of: auth, authorization, bearer, oauth, mtls, apiKey, api_key, basic, token, authToken). Absence is a weak signal — confirm whether the server relies on network-layer or host-level auth, or declare the real mechanism explicitly so reviewers can audit it.
Evidence
| 1 | # PySpark MCP Server |
| 2 | |
| 3 | |
| 4 | |
| 5 | SQL migration assistance, AWS Glue job generation, and Spark code optimization — as an MCP server. |
| 6 | |
| 7 | [](https://github.com/AnnasMazhar/pyspark_mcp/actions/workflows/ci.yml) |
| 8 | [](https://www.python.org/downloads/) |
| 9 | [](https://opensource.org/licenses/MIT |
Remediation
Declare a real authentication mechanism in the manifest, matching what the running server actually enforces:
- `"auth": "bearer"` with a token scheme documented for callers
- `"auth": "oauth"` / `"oauth2": { ... }` for delegated flows
- `"apiKey": { "header": "X-API-Key", "prefix": "..." }`
- `"mtls": true` when client certificates are required
If the server is intentionally unauthenticated (stdio-only, local developer tool, trusted-host network), state that assumption explicitly in the manifest so reviewers can audit it.
PySpark Tools FastMCP server emits notifications/tools/list_changed but tools/list response entries lack content-bound integrity fields (version, etag, digest, sha256, hash), enabling undetected tool list rotation attacks.
Evidence
| 1 | """FastMCP server for SQL to PySpark conversion with code review and optimization.""" |
| 2 | |
| 3 | import json |
| 4 | import os |
Remediation
Either:
1. Drop `notifications/tools/list_changed` from the server's capabilities and keep the tool list immutable for the lifetime of the connection, OR
2. Add a content-bound `version` / `etag` / `digest` field to each tool entry in `tools/list` responses. Recompute it whenever the handler / description / schema changes. The client can then surface an approval prompt on change.
LLM consensus
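Option 2 can be sketched with only the standard library. The entry fields (`name`, `description`, `inputSchema`) follow the MCP tools/list shape; `tool_digest` is a hypothetical helper, not part of FastMCP:

```python
import hashlib
import json

def tool_digest(name: str, description: str, input_schema: dict) -> str:
    """Content-bound digest: changes whenever the tool's contract changes."""
    # Canonical JSON (sorted keys, fixed separators) keeps the digest stable
    # across processes for identical content.
    canonical = json.dumps(
        {"name": name, "description": description, "inputSchema": input_schema},
        sort_keys=True,
        separators=(",", ":"),
    )
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
```

A client that pins the digest at approval time can detect any later rotation of the tool's description or schema.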
MCP server holds a mutable shared container (cache / store / state / pool / registry / sessions / results / outputs) and mutates it via append / push / add / extend, but no caller-identity marker (user_id / session_id / caller_id / request_id / org_id / tenant_id / actor_id / subject / principal) appears anywhere in the file. A process-global list mutated from inside a tool handler with no caller partition leaks data across requests: a later caller can read what an earlier caller wrote.
Evidence
| 1 | """FastMCP server for SQL to PySpark conversion with code review and optimization.""" |
| 2 | |
| 3 | import json |
| 4 | import os |
| 5 | import re |
| 6 | from typing import Any, Dict, List, Optional, Union |
| 7 | |
| 8 | from fastmcp import FastMCP |
| 9 | |
| 10 | from .advanced_optimizer import AdvancedOptimizer |
| 11 | from .aws_glue_integration import ( |
| 12 | AWSGlueIntegration, |
| 13 | DataCatalogTable, |
| 14 | DataFormat, |
| 15 | GlueJobConfig, |
| 16 | GlueJobType, |
| 17 | ) |
| 18 | from .batch_processor import BatchProcessor |
| 19 | from .code_reviewer import PySparkCodeReviewer |
| 20 | from .data_source_anal |
Remediation
Partition shared state by caller identity, or eliminate it.

Wrong:

    _cache = {}

    @mcp.tool()
    def search(query: str) -> list:
        if query in _cache:
            return _cache[query]
        result = _expensive(query)
        _cache[query] = result
        return result

Right (key by caller):

    _cache: dict[str, dict] = {}

    @mcp.tool()
    def search(query: str, ctx) -> list:
        user_cache = _cache.setdefault(ctx.user_id, {})
        if query in user_cache:
            return user_cache[query]
        result = _expensive(query)
        user_cache[query] = result
        return result
LLM consensus
MCP server holds a mutable shared container (cache / store / state / context / pool / registry / sessions / results / outputs) and writes to it via subscript assignment, but no caller-identity marker (user_id / session_id / caller_id / request_id / org_id / tenant_id / actor_id / subject / principal) appears anywhere in the file. A process-global cache written from inside a tool handler with no caller partition is a cross-request data path: one user's tool result can be served to another user.
Evidence
| 1 | """FastMCP server for SQL to PySpark conversion with code review and optimization.""" |
| 2 | |
| 3 | import json |
| 4 | import os |
| 5 | import re |
| 6 | from typing import Any, Dict, List, Optional, Union |
| 7 | |
| 8 | from fastmcp import FastMCP |
| 9 | |
| 10 | from .advanced_optimizer import AdvancedOptimizer |
| 11 | from .aws_glue_integration import ( |
| 12 | AWSGlueIntegration, |
| 13 | DataCatalogTable, |
| 14 | DataFormat, |
| 15 | GlueJobConfig, |
| 16 | GlueJobType, |
| 17 | ) |
| 18 | from .batch_processor import BatchProcessor |
| 19 | from .code_reviewer import PySparkCodeReviewer |
| 20 | from .data_source_anal |
Remediation
Partition shared state by caller identity, or eliminate it.

Wrong:

    _cache = {}

    @mcp.tool()
    def search(query: str) -> list:
        if query in _cache:
            return _cache[query]
        result = _expensive(query)
        _cache[query] = result
        return result

Right (key by caller):

    _cache: dict[str, dict] = {}

    @mcp.tool()
    def search(query: str, ctx) -> list:
        user_cache = _cache.setdefault(ctx.user_id, {})
        if query in user_cache:
            return user_cache[query]
        result = _expensive(query)
        user_cache[query] = result
        return result
LLM consensus
Silent error swallowing detected. An except clause whose body is `pass` or `...` discards the exception with no log, no metric, and no trace. This blinds incident response and hides real failures.
Evidence
| 1588 | col_name = str(select_expr) |
| 1589 | col_name = col_name.strip('"').strip("'") |
| 1590 | columns.append(f"col('{col_name}')") |
| 1591 | continue |
| 1592 | except: |
| 1593 | pass |
| 1594 | |
| 1595 | if isinstance(expr, sqlglot.expressions.Column): |
| 1596 | table = ( |
Remediation
Log the exception at minimum (`logger.exception(e)`), emit a metric, or re-raise if the error is not recoverable. If you genuinely want to ignore an exception, say so with a comment.
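The recommended minimum can be sketched against the evidence above. The function and logger names are illustrative, not the package's real identifiers; the pattern is simply "catch, log with traceback, return a safe fallback":

```python
import logging
from typing import Optional

logger = logging.getLogger("pyspark_tools.converter")

def normalize_expr(select_expr) -> Optional[str]:
    try:
        return str(select_expr).strip('"').strip("'")
    except Exception:
        # Log with traceback instead of silently discarding the error;
        # logger.exception() attaches the active traceback automatically.
        logger.exception("failed to normalize select expression")
        return None

class Broken:
    def __str__(self):
        raise ValueError("unprintable expression")

print(normalize_expr("'col_a'"))  # → col_a
print(normalize_expr(Broken()))   # → None, with the failure in the log
```

Compared with `except: pass`, the fallback behavior is identical, but every failure now leaves a traceback for incident response.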
Silent error swallowing detected. An `except` clause whose body is only `pass` or `...` discards the exception with no log, no metric, and no trace. This blinds incident response and hides real failures.
Evidence
| 6 | """CLI entry point for pyspark-mcp server.""" |
| 7 | from pyspark_tools.server import app |
| 8 | try: |
| 9 | app.run() |
| 10 | except KeyboardInterrupt: |
| 11 | pass |
Remediation
Log the exception at minimum (`logger.exception(e)`), emit a metric, or re-raise if the error is not recoverable. If you genuinely want to ignore an exception, say so with a comment.
Silent error swallowing detected. An `except` clause whose body is only `pass` or `...` discards the exception with no log, no metric, and no trace. This blinds incident response and hides real failures.
Evidence
| 1380 | sqlglot.expressions.Where |
| 1381 | ): |
| 1382 | where_expr = parsed_sql.find(sqlglot.expressions.Where) |
| 1383 | return self._convert_expression_to_filter(where_expr.this) |
| 1384 | except: |
| 1385 | pass |
| 1386 | return None |
| 1387 | |
| 1388 | def _convert_expression_to_filter(self, expr) -> str: |
Remediation
Log the exception at minimum (`logger.exception(e)`), emit a metric, or re-raise if the error is not recoverable. If you genuinely want to ignore an exception, say so with a comment.
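When ignoring an exception really is the intended behavior, the remediation asks for a comment; the standard library's `contextlib.suppress` makes the intent explicit and keeps the suppression narrow. The attribute names below are illustrative, not sqlglot's real API:

```python
import contextlib
from types import SimpleNamespace
from typing import Optional

def extract_where(parsed_sql) -> Optional[str]:
    # Intentional, documented suppression: an AST without the expected
    # shape simply means "no WHERE clause". suppress() catches only
    # AttributeError, unlike a bare `except: pass` that hides everything.
    with contextlib.suppress(AttributeError):
        return str(parsed_sql.where_node.this)
    return None

ast = SimpleNamespace(where_node=SimpleNamespace(this="amount > 100"))
print(extract_where(ast))       # → amount > 100
print(extract_where(object()))  # → None
```

A narrow exception type plus a comment turns "swallowed error" into "documented absence", which is what reviewers and scanners can verify.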
Silent error swallowing detected. An `except` clause whose body is only `pass` or `...` discards the exception with no log, no metric, and no trace. This blinds incident response and hides real failures.
Evidence
| 319 | try: |
| 320 | for cte in parsed_sql.find_all(sqlglot.expressions.CTE): |
| 321 | if hasattr(cte, "alias") and cte.alias: |
| 322 | cte_names.add(str(cte.alias).strip('"').strip("'")) |
| 323 | except Exception: |
| 324 | pass |
| 325 | |
| 326 | all_tables = self._extract_all_tables(parsed_sql) |
| 327 | if all_tables: |
Remediation
Log the exception at minimum (`logger.exception(e)`), emit a metric, or re-raise if the error is not recoverable. If you genuinely want to ignore an exception, say so with a comment.
Silent error swallowing detected. An `except` clause whose body is only `pass` or `...` discards the exception with no log, no metric, and no trace. This blinds incident response and hides real failures.
Evidence
| 1356 | if hasattr(table, "alias") and table.alias |
| 1357 | else table_name |
| 1358 | ) |
| 1359 | tables.append((table_name, table_alias)) |
| 1360 | except: |
| 1361 | pass |
| 1362 | return tables |
| 1363 | |
| 1364 | def _extract_all_tables(self, parsed_sql) -> List[str]: |
Remediation
Log the exception at minimum (`logger.exception(e)`), emit a metric, or re-raise if the error is not recoverable. If you genuinely want to ignore an exception, say so with a comment.
Silent error swallowing detected. An `except` clause whose body is only `pass` or `...` discards the exception with no log, no metric, and no trace. This blinds incident response and hides real failures.
Evidence
| 491 | for expr in parsed.expressions: |
| 492 | col_str = self._convert_expression_to_pyspark(expr, dialect) |
| 493 | cols.append(col_str) |
| 494 | return ", ".join(cols) if cols else "" |
| 495 | except Exception: |
| 496 | pass |
| 497 | return "" |
| 498 | |
| 499 | def _extract_cte_from(self, parsed) -> str: |
Remediation
Log the exception at minimum (`logger.exception(e)`), emit a metric, or re-raise if the error is not recoverable. If you genuinely want to ignore an exception, say so with a comment.
Silent error swallowing detected. An `except` clause whose body is only `pass` or `...` discards the exception with no log, no metric, and no trace. This blinds incident response and hides real failures.
Evidence
| 524 | group = parsed.find(sqlglot.expressions.Group) |
| 525 | if group and hasattr(group, "expressions"): |
| 526 | cols = [f"'{str(e)}'" for e in group.expressions] |
| 527 | return ", ".join(cols) |
| 528 | except Exception: |
| 529 | pass |
| 530 | return "" |
| 531 | |
| 532 | def _handle_subqueries(self, parsed_sql, dialect: str) -> List[str]: |
Remediation
Log the exception at minimum (`logger.exception(e)`), emit a metric, or re-raise if the error is not recoverable. If you genuinely want to ignore an exception, say so with a comment.
Silent error swallowing detected. An `except` clause whose body is only `pass` or `...` discards the exception with no log, no metric, and no trace. This blinds incident response and hides real failures.
Evidence
| 558 | sub_from = str(sub_table.name) |
| 559 | sub_where_node = subquery.this.find(sqlglot.expressions.Where) |
| 560 | if sub_where_node: |
| 561 | sub_where = str(sub_where_node.this) |
| 562 | except Exception: |
| 563 | pass |
| 564 | |
| 565 | code_lines.append(f"# Subquery {i+1}: {sub_spark[:80]}") |
| 566 | if sub_from: |
Remediation
Log the exception at minimum (`logger.exception(e)`), emit a metric, or re-raise if the error is not recoverable. If you genuinely want to ignore an exception, say so with a comment.
Silent error swallowing detected. An `except` clause whose body is only `pass` or `...` discards the exception with no log, no metric, and no trace. This blinds incident response and hides real failures.
Evidence
| 513 | try: |
| 514 | where = parsed.find(sqlglot.expressions.Where) |
| 515 | if where: |
| 516 | return str(where.this) |
| 517 | except Exception: |
| 518 | pass |
| 519 | return "" |
| 520 | |
| 521 | def _extract_cte_groupby(self, parsed) -> str: |
Remediation
Log the exception at minimum (`logger.exception(e)`), emit a metric, or re-raise if the error is not recoverable. If you genuinely want to ignore an exception, say so with a comment.
Silent error swallowing detected. An `except` clause whose body is only `pass` or `...` discards the exception with no log, no metric, and no trace. This blinds incident response and hides real failures.
Evidence
| 503 | if from_clause: |
| 504 | table = from_clause.find(sqlglot.expressions.Table) |
| 505 | if table: |
| 506 | return str(table.name) |
| 507 | except Exception: |
| 508 | pass |
| 509 | return "" |
| 510 | |
| 511 | def _extract_cte_where(self, parsed) -> str: |
Remediation
Log the exception at minimum (`logger.exception(e)`), emit a metric, or re-raise if the error is not recoverable. If you genuinely want to ignore an exception, say so with a comment.
Silent error swallowing detected. An `except` clause whose body is only `pass` or `...` discards the exception with no log, no metric, and no trace. This blinds incident response and hides real failures.
Evidence
| 1652 | col_ref += ".asc()" |
| 1653 | |
| 1654 | order_cols.append(col_ref) |
| 1655 | return ", ".join(order_cols) |
| 1656 | except: |
| 1657 | pass |
| 1658 | return None |
| 1659 | |
| 1660 | def _convert_where_to_filter(self, where_str: str) -> str: |
Remediation
Log the exception at minimum (`logger.exception(e)`), emit a metric, or re-raise if the error is not recoverable. If you genuinely want to ignore an exception, say so with a comment.
Silent error swallowing detected. An `except` clause whose body is only `pass` or `...` discards the exception with no log, no metric, and no trace. This blinds incident response and hides real failures.
Evidence
| 995 | if RESOURCE_MANAGEMENT_AVAILABLE: |
| 996 | try: |
| 997 | resource_manager = get_resource_manager() |
| 998 | resource_manager.cleanup_all() |
| 999 | except Exception: |
| 1000 | pass |
| 1001 | |
| 1002 | # Note: Individual connections are closed in their respective methods |
| 1003 | # using 'with' statements or explicit close() calls |
Remediation
Log the exception at minimum (`logger.exception(e)`), emit a metric, or re-raise if the error is not recoverable. If you genuinely want to ignore an exception, say so with a comment.
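Cleanup paths are where swallowed exceptions hurt most, because a silent failure leaks the resource. A sketch of logging-and-continuing cleanup (the `cleanup_all` signature and resource classes below are illustrative, not the package's real resource manager):

```python
import logging

logger = logging.getLogger("pyspark_tools.cleanup")

def cleanup_all(resources) -> int:
    """Close every resource; log failures instead of hiding them."""
    failures = 0
    for res in resources:
        try:
            res.close()
        except Exception:
            failures += 1
            # Keep releasing the remaining resources, but leave a trace.
            logger.exception("cleanup failed for %r", res)
    return failures

class Good:
    def __init__(self):
        self.closed = False
    def close(self):
        self.closed = True

class Bad:
    def close(self):
        raise RuntimeError("boom")

good = Good()
print(cleanup_all([Bad(), good]))  # → 1
print(good.closed)                 # → True: later resources still closed
```

Returning the failure count also gives callers a hook for the "emit a metric" half of the remediation.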
Silent error swallowing detected. An `except` clause whose body is only `pass` or `...` discards the exception with no log, no metric, and no trace. This blinds incident response and hides real failures.
Evidence
| 1368 | if hasattr(parsed_sql, "find_all"): |
| 1369 | for table in parsed_sql.find_all(sqlglot.expressions.Table): |
| 1370 | if hasattr(table, "name"): |
| 1371 | tables.add(table.name) |
| 1372 | except: |
| 1373 | pass |
| 1374 | return sorted(list(tables)) |
| 1375 | |
| 1376 | def _extract_where_clause(self, parsed_sql) -> Optional[str]: |
Remediation
Log the exception at minimum (`logger.exception(e)`), emit a metric, or re-raise if the error is not recoverable. If you genuinely want to ignore an exception, say so with a comment.
Silent error swallowing detected. An `except` clause whose body is only `pass` or `...` discards the exception with no log, no metric, and no trace. This blinds incident response and hides real failures.
Evidence
| 1608 | else: |
| 1609 | col_str = str(expr).strip('"').strip("'") |
| 1610 | columns.append(f"col('{col_str}')") |
| 1611 | return ", ".join(columns) |
| 1612 | except: |
| 1613 | pass |
| 1614 | return None |
| 1615 | |
| 1616 | def _extract_order_by_clause(self, parsed_sql) -> Optional[str]: |
Remediation
Log the exception at minimum (`logger.exception(e)`), emit a metric, or re-raise if the error is not recoverable. If you genuinely want to ignore an exception, say so with a comment.